diff --git a/modules/source/08_optimizers/optimizers_dev.ipynb b/modules/source/08_optimizers/optimizers_dev.ipynb new file mode 100644 index 00000000..223efc4a --- /dev/null +++ b/modules/source/08_optimizers/optimizers_dev.ipynb @@ -0,0 +1,1754 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "602ba54a", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Module 8: Optimizers - Gradient-Based Parameter Updates\n", + "\n", + "Welcome to the Optimizers module! This is where neural networks learn to improve through intelligent parameter updates.\n", + "\n", + "## Learning Goals\n", + "- Understand gradient descent and how optimizers use gradients to update parameters\n", + "- Implement SGD with momentum for accelerated convergence\n", + "- Build Adam optimizer with adaptive learning rates\n", + "- Master learning rate scheduling strategies\n", + "- See how optimizers enable effective neural network training\n", + "\n", + "## Build โ†’ Use โ†’ Analyze\n", + "1. **Build**: Core optimization algorithms (SGD, Adam)\n", + "2. **Use**: Apply optimizers to train neural networks\n", + "3. 
#| default_exp core.optimizers

#| export
import math
import numpy as np
import sys
import os
from typing import List, Dict, Any, Optional, Union
from collections import defaultdict

# Helper function to set up import paths
def setup_import_paths():
    """Append the sibling development module directories to sys.path.

    Resolves the modules root relative to this file and makes the
    '01_tensor' and '07_autograd' development modules importable.
    """
    modules_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    for subdir in ('01_tensor', '07_autograd'):
        candidate = os.path.join(modules_root, subdir)
        if candidate not in sys.path:
            sys.path.append(candidate)

# Import our existing components: prefer the installed package, then the
# local development modules, then minimal fallbacks so tests can still run.
try:
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.autograd import Variable
except ImportError:
    try:
        setup_import_paths()
        from tensor_dev import Tensor
        from autograd_dev import Variable
    except ImportError:
        # Last resort: tiny stand-ins that mimic the real interfaces.
        print("Warning: Using fallback classes for testing")

        class Tensor:
            """Minimal ndarray wrapper standing in for the real Tensor."""

            def __init__(self, data):
                self.data = np.array(data)
                self.shape = self.data.shape

            def __str__(self):
                return f"Tensor({self.data})"

        class Variable:
            """Minimal autograd stand-in: a Tensor value plus a .grad slot."""

            def __init__(self, data, requires_grad=True):
                # Promote bare Python scalars to one-element tensors.
                self.data = Tensor([data]) if isinstance(data, (int, float)) else Tensor(data)
                self.requires_grad = requires_grad
                self.grad = None

            def zero_grad(self):
                """Discard any accumulated gradient."""
                self.grad = None

            def __str__(self):
                return f"Variable({self.data.data})"

print("๐Ÿ”ฅ TinyTorch Optimizers Module")
print(f"NumPy version: {np.__version__}")
print(f"Python version: {sys.version_info.major}.{sys.version_info.minor}")
print("Ready to build optimization algorithms!")
[ + "## What Are Optimizers?\n", + "\n", + "### The Problem: How to Update Parameters\n", + "Neural networks learn by updating parameters using gradients:\n", + "```\n", + "parameter_new = parameter_old - learning_rate * gradient\n", + "```\n", + "\n", + "But **naive gradient descent** has problems:\n", + "- **Slow convergence**: Takes many steps to reach optimum\n", + "- **Oscillation**: Bounces around valleys without making progress\n", + "- **Poor scaling**: Same learning rate for all parameters\n", + "\n", + "### The Solution: Smart Optimization\n", + "**Optimizers** are algorithms that intelligently update parameters:\n", + "- **Momentum**: Accelerate convergence by accumulating velocity\n", + "- **Adaptive learning rates**: Different learning rates for different parameters\n", + "- **Second-order information**: Use curvature to guide updates\n", + "\n", + "### Real-World Impact\n", + "- **SGD**: The foundation of all neural network training\n", + "- **Adam**: The default optimizer for most deep learning applications\n", + "- **Learning rate scheduling**: Critical for training stability and performance\n", + "\n", + "### What We'll Build\n", + "1. **SGD**: Stochastic Gradient Descent with momentum\n", + "2. **Adam**: Adaptive Moment Estimation optimizer\n", + "3. **StepLR**: Learning rate scheduling\n", + "4. 
**Integration**: Complete training loop with optimizers" + ] + }, + { + "cell_type": "markdown", + "id": "8ccea3ce", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## Step 1: Understanding Gradient Descent\n", + "\n", + "### What is Gradient Descent?\n", + "**Gradient descent** finds the minimum of a function by following the negative gradient:\n", + "\n", + "```\n", + "ฮธ_{t+1} = ฮธ_t - ฮฑ โˆ‡f(ฮธ_t)\n", + "```\n", + "\n", + "Where:\n", + "- ฮธ: Parameters we want to optimize\n", + "- ฮฑ: Learning rate (how big steps to take)\n", + "- โˆ‡f(ฮธ): Gradient of loss function with respect to parameters\n", + "\n", + "### Why Gradient Descent Works\n", + "1. **Gradients point uphill**: Negative gradient points toward minimum\n", + "2. **Iterative improvement**: Each step reduces the loss (in theory)\n", + "3. **Local convergence**: Finds local minimum with proper learning rate\n", + "4. **Scalable**: Works with millions of parameters\n", + "\n", + "### The Learning Rate Dilemma\n", + "- **Too large**: Overshoots minimum, diverges\n", + "- **Too small**: Extremely slow convergence\n", + "- **Just right**: Steady progress toward minimum\n", + "\n", + "### Visual Understanding\n", + "```\n", + "Loss landscape: \\__/\n", + "Start here: โ†‘\n", + "Gradient descent: โ†“ โ†’ โ†“ โ†’ โ†“ โ†’ minimum\n", + "```\n", + "\n", + "### Real-World Applications\n", + "- **Neural networks**: Training any deep learning model\n", + "- **Machine learning**: Logistic regression, SVM, etc.\n", + "- **Scientific computing**: Optimization problems in physics, engineering\n", + "- **Economics**: Portfolio optimization, game theory\n", + "\n", + "Let's implement gradient descent to understand it deeply!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d41c2596", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "gradient-descent-function", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "def gradient_descent_step(parameter: Variable, learning_rate: float) -> None:\n", + " \"\"\"\n", + " Perform one step of gradient descent on a parameter.\n", + " \n", + " Args:\n", + " parameter: Variable with gradient information\n", + " learning_rate: How much to update parameter\n", + " \n", + " TODO: Implement basic gradient descent parameter update.\n", + " \n", + " STEP-BY-STEP IMPLEMENTATION:\n", + " 1. Check if parameter has a gradient\n", + " 2. Get current parameter value and gradient\n", + " 3. Update parameter: new_value = old_value - learning_rate * gradient\n", + " 4. Update parameter data with new value\n", + " 5. Handle edge cases (no gradient, invalid values)\n", + " \n", + " EXAMPLE USAGE:\n", + " ```python\n", + " # Parameter with gradient\n", + " w = Variable(2.0, requires_grad=True)\n", + " w.grad = Variable(0.5) # Gradient from loss\n", + " \n", + " # Update parameter\n", + " gradient_descent_step(w, learning_rate=0.1)\n", + " # w.data now contains: 2.0 - 0.1 * 0.5 = 1.95\n", + " ```\n", + " \n", + " IMPLEMENTATION HINTS:\n", + " - Check if parameter.grad is not None\n", + " - Use parameter.grad.data.data to get gradient value\n", + " - Update parameter.data with new Tensor\n", + " - Don't modify gradient (it's used for logging)\n", + " \n", + " LEARNING CONNECTIONS:\n", + " - This is the foundation of all neural network training\n", + " - PyTorch's optimizer.step() does exactly this\n", + " - The learning rate determines convergence speed\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " if parameter.grad is not None:\n", + " # Get current parameter value and gradient\n", + " current_value = 
parameter.data.data\n", + " gradient_value = parameter.grad.data.data\n", + " \n", + " # Update parameter: new_value = old_value - learning_rate * gradient\n", + " new_value = current_value - learning_rate * gradient_value\n", + " \n", + " # Update parameter data\n", + " parameter.data = Tensor(new_value)\n", + " ### END SOLUTION" + ] + }, + { + "cell_type": "markdown", + "id": "4d2e1fd4", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### ๐Ÿงช Unit Test: Gradient Descent Step\n", + "\n", + "Let's test your gradient descent implementation right away! This is the foundation of all optimization algorithms.\n", + "\n", + "**This is a unit test** - it tests one specific function (gradient_descent_step) in isolation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f092d289", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": true, + "grade_id": "test-gradient-descent", + "locked": true, + "points": 10, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_gradient_descent_step_comprehensive():\n", + " \"\"\"Test basic gradient descent parameter update\"\"\"\n", + " print(\"๐Ÿ”ฌ Unit Test: Gradient Descent Step...\")\n", + " \n", + " # Test basic parameter update\n", + " try:\n", + " w = Variable(2.0, requires_grad=True)\n", + " w.grad = Variable(0.5) # Positive gradient\n", + " \n", + " original_value = w.data.data.item()\n", + " gradient_descent_step(w, learning_rate=0.1)\n", + " new_value = w.data.data.item()\n", + " \n", + " expected_value = original_value - 0.1 * 0.5 # 2.0 - 0.05 = 1.95\n", + " assert abs(new_value - expected_value) < 1e-6, f\"Expected {expected_value}, got {new_value}\"\n", + " print(\"โœ… Basic parameter update works\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Basic parameter update failed: {e}\")\n", + " raise\n", + "\n", + " # Test with negative gradient\n", + " try:\n", + " w2 = 
Variable(1.0, requires_grad=True)\n", + " w2.grad = Variable(-0.2) # Negative gradient\n", + " \n", + " gradient_descent_step(w2, learning_rate=0.1)\n", + " expected_value2 = 1.0 - 0.1 * (-0.2) # 1.0 + 0.02 = 1.02\n", + " assert abs(w2.data.data.item() - expected_value2) < 1e-6, \"Negative gradient test failed\"\n", + " print(\"โœ… Negative gradient handling works\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Negative gradient handling failed: {e}\")\n", + " raise\n", + "\n", + " # Test with no gradient (should not update)\n", + " try:\n", + " w3 = Variable(3.0, requires_grad=True)\n", + " w3.grad = None\n", + " original_value3 = w3.data.data.item()\n", + " \n", + " gradient_descent_step(w3, learning_rate=0.1)\n", + " assert w3.data.data.item() == original_value3, \"Parameter with no gradient should not update\"\n", + " print(\"โœ… No gradient case works\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ No gradient case failed: {e}\")\n", + " raise\n", + "\n", + " print(\"๐ŸŽฏ Gradient descent step behavior:\")\n", + " print(\" Updates parameters in negative gradient direction\")\n", + " print(\" Uses learning rate to control step size\")\n", + " print(\" Skips updates when gradient is None\")\n", + " print(\"๐Ÿ“ˆ Progress: Gradient Descent Step โœ“\")\n", + "\n", + "# Test function is called by auto-discovery system" + ] + }, + { + "cell_type": "markdown", + "id": "bc218834", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## Step 2: SGD with Momentum\n", + "\n", + "### What is SGD?\n", + "**SGD (Stochastic Gradient Descent)** is the fundamental optimization algorithm:\n", + "\n", + "```\n", + "ฮธ_{t+1} = ฮธ_t - ฮฑ โˆ‡L(ฮธ_t)\n", + "```\n", + "\n", + "### The Problem with Vanilla SGD\n", + "- **Slow convergence**: Especially in narrow valleys\n", + "- **Oscillation**: Bounces around without making progress\n", + "- **Poor conditioning**: Struggles with ill-conditioned problems\n", + "\n", + 
"### The Solution: Momentum\n", + "**Momentum** accumulates velocity to accelerate convergence:\n", + "\n", + "```\n", + "v_t = ฮฒ v_{t-1} + โˆ‡L(ฮธ_t)\n", + "ฮธ_{t+1} = ฮธ_t - ฮฑ v_t\n", + "```\n", + "\n", + "Where:\n", + "- v_t: Velocity (exponential moving average of gradients)\n", + "- ฮฒ: Momentum coefficient (typically 0.9)\n", + "- ฮฑ: Learning rate\n", + "\n", + "### Why Momentum Works\n", + "1. **Acceleration**: Builds up speed in consistent directions\n", + "2. **Dampening**: Reduces oscillations in inconsistent directions\n", + "3. **Memory**: Remembers previous gradient directions\n", + "4. **Robustness**: Less sensitive to noisy gradients\n", + "\n", + "### Visual Understanding\n", + "```\n", + "Without momentum: โ†—โ†™โ†—โ†™โ†—โ†™ (oscillating)\n", + "With momentum: โ†—โ†’โ†’โ†’โ†’โ†’ (smooth progress)\n", + "```\n", + "\n", + "### Real-World Applications\n", + "- **Image classification**: Training ResNet, VGG\n", + "- **Natural language**: Training RNNs, early transformers\n", + "- **Classic choice**: Still used when Adam fails\n", + "- **Large batch training**: Often preferred over Adam\n", + "\n", + "Let's implement SGD with momentum!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f587b7f", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "sgd-class", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class SGD:\n", + " \"\"\"\n", + " SGD Optimizer with Momentum\n", + " \n", + " Implements stochastic gradient descent with momentum:\n", + " v_t = momentum * v_{t-1} + gradient\n", + " parameter = parameter - learning_rate * v_t\n", + " \"\"\"\n", + " \n", + " def __init__(self, parameters: List[Variable], learning_rate: float = 0.01, \n", + " momentum: float = 0.0, weight_decay: float = 0.0):\n", + " \"\"\"\n", + " Initialize SGD optimizer.\n", + " \n", + " Args:\n", + " parameters: List of Variables to optimize\n", + " learning_rate: Learning rate (default: 0.01)\n", + " momentum: Momentum coefficient (default: 0.0)\n", + " weight_decay: L2 regularization coefficient (default: 0.0)\n", + " \n", + " TODO: Implement SGD optimizer initialization.\n", + " \n", + " APPROACH:\n", + " 1. Store parameters and hyperparameters\n", + " 2. Initialize momentum buffers for each parameter\n", + " 3. Set up state tracking for optimization\n", + " 4. 
Prepare for step() and zero_grad() methods\n", + " \n", + " EXAMPLE:\n", + " ```python\n", + " # Create optimizer\n", + " optimizer = SGD([w1, w2, b1, b2], learning_rate=0.01, momentum=0.9)\n", + " \n", + " # In training loop:\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " ```\n", + " \n", + " HINTS:\n", + " - Store parameters as a list\n", + " - Initialize momentum buffers as empty dict\n", + " - Use parameter id() as key for momentum tracking\n", + " - Momentum buffers will be created lazily in step()\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " self.parameters = parameters\n", + " self.learning_rate = learning_rate\n", + " self.momentum = momentum\n", + " self.weight_decay = weight_decay\n", + " \n", + " # Initialize momentum buffers (created lazily)\n", + " self.momentum_buffers = {}\n", + " \n", + " # Track optimization steps\n", + " self.step_count = 0\n", + " ### END SOLUTION\n", + " \n", + " def step(self) -> None:\n", + " \"\"\"\n", + " Perform one optimization step.\n", + " \n", + " TODO: Implement SGD parameter update with momentum.\n", + " \n", + " APPROACH:\n", + " 1. Iterate through all parameters\n", + " 2. For each parameter with gradient:\n", + " a. Get current gradient\n", + " b. Apply weight decay if specified\n", + " c. Update momentum buffer (or create if first time)\n", + " d. Update parameter using momentum\n", + " 3. 
Increment step count\n", + " \n", + " MATHEMATICAL FORMULATION:\n", + " - If weight_decay > 0: gradient = gradient + weight_decay * parameter\n", + " - momentum_buffer = momentum * momentum_buffer + gradient\n", + " - parameter = parameter - learning_rate * momentum_buffer\n", + " \n", + " IMPLEMENTATION HINTS:\n", + " - Use id(param) as key for momentum buffers\n", + " - Initialize buffer with zeros if not exists\n", + " - Handle case where momentum = 0 (no momentum)\n", + " - Update parameter.data with new Tensor\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " for param in self.parameters:\n", + " if param.grad is not None:\n", + " # Get gradient\n", + " gradient = param.grad.data.data\n", + " \n", + " # Apply weight decay (L2 regularization)\n", + " if self.weight_decay > 0:\n", + " gradient = gradient + self.weight_decay * param.data.data\n", + " \n", + " # Get or create momentum buffer\n", + " param_id = id(param)\n", + " if param_id not in self.momentum_buffers:\n", + " self.momentum_buffers[param_id] = np.zeros_like(param.data.data)\n", + " \n", + " # Update momentum buffer\n", + " self.momentum_buffers[param_id] = (\n", + " self.momentum * self.momentum_buffers[param_id] + gradient\n", + " )\n", + " \n", + " # Update parameter\n", + " param.data = Tensor(\n", + " param.data.data - self.learning_rate * self.momentum_buffers[param_id]\n", + " )\n", + " \n", + " self.step_count += 1\n", + " ### END SOLUTION\n", + " \n", + " def zero_grad(self) -> None:\n", + " \"\"\"\n", + " Zero out gradients for all parameters.\n", + " \n", + " TODO: Implement gradient zeroing.\n", + " \n", + " APPROACH:\n", + " 1. Iterate through all parameters\n", + " 2. Set gradient to None for each parameter\n", + " 3. 
This prepares for next backward pass\n", + " \n", + " IMPLEMENTATION HINTS:\n", + " - Simply set param.grad = None\n", + " - This is called before loss.backward()\n", + " - Essential for proper gradient accumulation\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " for param in self.parameters:\n", + " param.grad = None\n", + " ### END SOLUTION" + ] + }, + { + "cell_type": "markdown", + "id": "4adee99c", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### ๐Ÿงช Unit Test: SGD Optimizer\n", + "\n", + "Let's test your SGD optimizer implementation! This optimizer adds momentum to gradient descent for better convergence.\n", + "\n", + "**This is a unit test** - it tests one specific class (SGD) in isolation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa93aa53", + "metadata": { + "nbgrader": { + "grade": true, + "grade_id": "test-sgd", + "locked": true, + "points": 15, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_sgd_optimizer_comprehensive():\n", + " \"\"\"Test SGD optimizer implementation\"\"\"\n", + " print(\"๐Ÿ”ฌ Unit Test: SGD Optimizer...\")\n", + " \n", + " # Create test parameters\n", + " w1 = Variable(1.0, requires_grad=True)\n", + " w2 = Variable(2.0, requires_grad=True)\n", + " b = Variable(0.5, requires_grad=True)\n", + " \n", + " # Create optimizer\n", + " optimizer = SGD([w1, w2, b], learning_rate=0.1, momentum=0.9)\n", + " \n", + " # Test zero_grad\n", + " try:\n", + " w1.grad = Variable(0.1)\n", + " w2.grad = Variable(0.2)\n", + " b.grad = Variable(0.05)\n", + " \n", + " optimizer.zero_grad()\n", + " \n", + " assert w1.grad is None, \"Gradient should be None after zero_grad\"\n", + " assert w2.grad is None, \"Gradient should be None after zero_grad\"\n", + " assert b.grad is None, \"Gradient should be None after zero_grad\"\n", + " print(\"โœ… zero_grad() works correctly\")\n", + " \n", + " except Exception as e:\n", 
+ " print(f\"โŒ zero_grad() failed: {e}\")\n", + " raise\n", + " \n", + " # Test step with gradients\n", + " try:\n", + " w1.grad = Variable(0.1)\n", + " w2.grad = Variable(0.2)\n", + " b.grad = Variable(0.05)\n", + " \n", + " # First step (no momentum yet)\n", + " original_w1 = w1.data.data.item()\n", + " original_w2 = w2.data.data.item()\n", + " original_b = b.data.data.item()\n", + " \n", + " optimizer.step()\n", + " \n", + " # Check parameter updates\n", + " expected_w1 = original_w1 - 0.1 * 0.1 # 1.0 - 0.01 = 0.99\n", + " expected_w2 = original_w2 - 0.1 * 0.2 # 2.0 - 0.02 = 1.98\n", + " expected_b = original_b - 0.1 * 0.05 # 0.5 - 0.005 = 0.495\n", + " \n", + " assert abs(w1.data.data.item() - expected_w1) < 1e-6, f\"w1 update failed: expected {expected_w1}, got {w1.data.data.item()}\"\n", + " assert abs(w2.data.data.item() - expected_w2) < 1e-6, f\"w2 update failed: expected {expected_w2}, got {w2.data.data.item()}\"\n", + " assert abs(b.data.data.item() - expected_b) < 1e-6, f\"b update failed: expected {expected_b}, got {b.data.data.item()}\"\n", + " print(\"โœ… Parameter updates work correctly\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Parameter updates failed: {e}\")\n", + " raise\n", + " \n", + " # Test momentum buffers\n", + " try:\n", + " assert len(optimizer.momentum_buffers) == 3, f\"Should have 3 momentum buffers, got {len(optimizer.momentum_buffers)}\"\n", + " assert optimizer.step_count == 1, f\"Step count should be 1, got {optimizer.step_count}\"\n", + " print(\"โœ… Momentum buffers created correctly\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Momentum buffers failed: {e}\")\n", + " raise\n", + " \n", + " # Test step counting\n", + " try:\n", + " w1.grad = Variable(0.1)\n", + " w2.grad = Variable(0.2)\n", + " b.grad = Variable(0.05)\n", + " \n", + " optimizer.step()\n", + " \n", + " assert optimizer.step_count == 2, f\"Step count should be 2, got {optimizer.step_count}\"\n", + " print(\"โœ… Step counting 
works correctly\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Step counting failed: {e}\")\n", + " raise\n", + "\n", + " print(\"๐ŸŽฏ SGD optimizer behavior:\")\n", + " print(\" Maintains momentum buffers for accelerated updates\")\n", + " print(\" Tracks step count for learning rate scheduling\")\n", + " print(\" Supports weight decay for regularization\")\n", + " print(\"๐Ÿ“ˆ Progress: SGD Optimizer โœ“\")\n", + "\n", + "# Run the test\n", + "test_sgd_optimizer_comprehensive()" + ] + }, + { + "cell_type": "markdown", + "id": "3730c6d6", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## Step 3: Adam - Adaptive Learning Rates\n", + "\n", + "### What is Adam?\n", + "**Adam (Adaptive Moment Estimation)** is the most popular optimizer in deep learning:\n", + "\n", + "```\n", + "m_t = ฮฒโ‚ m_{t-1} + (1 - ฮฒโ‚) โˆ‡L(ฮธ_t) # First moment (momentum)\n", + "v_t = ฮฒโ‚‚ v_{t-1} + (1 - ฮฒโ‚‚) (โˆ‡L(ฮธ_t))ยฒ # Second moment (variance)\n", + "mฬ‚_t = m_t / (1 - ฮฒโ‚แต—) # Bias correction\n", + "vฬ‚_t = v_t / (1 - ฮฒโ‚‚แต—) # Bias correction\n", + "ฮธ_{t+1} = ฮธ_t - ฮฑ mฬ‚_t / (โˆšvฬ‚_t + ฮต) # Parameter update\n", + "```\n", + "\n", + "### Why Adam is Revolutionary\n", + "1. **Adaptive learning rates**: Different learning rate for each parameter\n", + "2. **Momentum**: Accelerates convergence like SGD\n", + "3. **Variance adaptation**: Scales updates based on gradient variance\n", + "4. **Bias correction**: Handles initialization bias\n", + "5. **Robust**: Works well with minimal hyperparameter tuning\n", + "\n", + "### The Three Key Ideas\n", + "1. **First moment (m_t)**: Exponential moving average of gradients (momentum)\n", + "2. **Second moment (v_t)**: Exponential moving average of squared gradients (variance)\n", + "3. 
#| export
class Adam:
    """
    Adam Optimizer

    Adaptive moment estimation: keeps exponentially decaying averages of
    gradients (first moment) and squared gradients (second moment),
    corrects their initialization bias, and scales each parameter's step
    by the inverse root of its estimated variance:

        m_t = beta1 * m_{t-1} + (1 - beta1) * g
        v_t = beta2 * v_{t-1} + (1 - beta2) * g^2
        p   = p - lr * (m_t / (1 - beta1^t)) / (sqrt(v_t / (1 - beta2^t)) + eps)
    """

    def __init__(self, parameters: List[Variable], learning_rate: float = 0.001,
                 beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8,
                 weight_decay: float = 0.0):
        """
        Initialize Adam optimizer.

        Args:
            parameters: List of Variables to optimize
            learning_rate: Step size for parameter updates (default: 0.001)
            beta1: Exponential decay rate for the first moment (default: 0.9)
            beta2: Exponential decay rate for the second moment (default: 0.999)
            epsilon: Small constant for numerical stability (default: 1e-8)
            weight_decay: L2 regularization coefficient (default: 0.0)
        """
        ### BEGIN SOLUTION
        self.parameters = parameters
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = weight_decay

        # Moment buffers keyed by id(param); created lazily on first step().
        self.first_moment = {}   # m_t: running average of gradients
        self.second_moment = {}  # v_t: running average of squared gradients

        # Step counter t, needed for bias correction.
        self.step_count = 0
        ### END SOLUTION

    def step(self) -> None:
        """
        Perform one optimization step using the Adam algorithm.

        Increments the step counter, then updates every parameter that has
        a gradient using bias-corrected first/second moment estimates.
        """
        ### BEGIN SOLUTION
        self.step_count += 1

        # Bias-correction denominators are identical for all parameters,
        # so compute them once per step.
        bias1 = 1 - self.beta1 ** self.step_count
        bias2 = 1 - self.beta2 ** self.step_count

        for param in self.parameters:
            if param.grad is None:
                continue  # skip parameters that received no gradient

            grad = param.grad.data.data

            # Fold L2 regularization into the gradient.
            if self.weight_decay > 0:
                grad = grad + self.weight_decay * param.data.data

            key = id(param)
            if key not in self.first_moment:
                self.first_moment[key] = np.zeros_like(param.data.data)
                self.second_moment[key] = np.zeros_like(param.data.data)

            # Exponential moving averages of the gradient and its square.
            m = self.beta1 * self.first_moment[key] + (1 - self.beta1) * grad
            v = self.beta2 * self.second_moment[key] + (1 - self.beta2) * grad * grad
            self.first_moment[key] = m
            self.second_moment[key] = v

            # Bias-corrected adaptive update.
            update = (m / bias1) / (np.sqrt(v / bias2) + self.epsilon)
            param.data = Tensor(param.data.data - self.learning_rate * update)
        ### END SOLUTION

    def zero_grad(self) -> None:
        """
        Zero out gradients for all parameters.

        Called before loss.backward() so gradients do not accumulate
        across iterations.
        """
        ### BEGIN SOLUTION
        for param in self.parameters:
            param.grad = None
        ### END SOLUTION
def test_adam_optimizer_comprehensive():
    """Test Adam optimizer implementation"""
    print("๐Ÿ”ฌ Unit Test: Adam Optimizer...")
    
    # Create test parameters
    w1 = Variable(1.0, requires_grad=True)
    w2 = Variable(2.0, requires_grad=True)
    b = Variable(0.5, requires_grad=True)
    
    # Create optimizer
    optimizer = Adam([w1, w2, b], learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8)
    
    # Test zero_grad
    try:
        w1.grad = Variable(0.1)
        w2.grad = Variable(0.2)
        b.grad = Variable(0.05)
        
        optimizer.zero_grad()
        
        assert w1.grad is None, "Gradient should be None after zero_grad"
        assert w2.grad is None, "Gradient should be None after zero_grad"
        assert b.grad is None, "Gradient should be None after zero_grad"
        print("โœ… zero_grad() works correctly")
        
    except Exception as e:
        print(f"โŒ zero_grad() failed: {e}")
        raise
    
    # Test step with gradients
    try:
        w1.grad = Variable(0.1)
        w2.grad = Variable(0.2)
        b.grad = Variable(0.05)
        
        # First step
        original_w1 = w1.data.data.item()
        original_w2 = w2.data.data.item()
        original_b = b.data.data.item()
        
        optimizer.step()
        
        # Check that parameters were updated (Adam uses adaptive learning rates)
        assert w1.data.data.item() != original_w1, "w1 should have been updated"
        assert w2.data.data.item() != original_w2, "w2 should have been updated"
        assert b.data.data.item() != original_b, "b should have been updated"
        print("โœ… Parameter updates work correctly")
        
    except Exception as e:
        print(f"โŒ Parameter updates failed: {e}")
        raise
    
    # Test moment buffers
    try:
        assert len(optimizer.first_moment) == 3, f"Should have 3 first moment buffers, got {len(optimizer.first_moment)}"
        assert len(optimizer.second_moment) == 3, f"Should have 3 second moment buffers, got {len(optimizer.second_moment)}"
        print("โœ… Moment buffers created correctly")
        
    except Exception as e:
        print(f"โŒ Moment buffers failed: {e}")
        raise
    
    # Test step counting and bias correction
    try:
        assert optimizer.step_count == 1, f"Step count should be 1, got {optimizer.step_count}"
        
        # Take another step
        w1.grad = Variable(0.1)
        w2.grad = Variable(0.2)
        b.grad = Variable(0.05)
        
        optimizer.step()
        
        assert optimizer.step_count == 2, f"Step count should be 2, got {optimizer.step_count}"
        print("โœ… Step counting and bias correction work correctly")
        
    except Exception as e:
        print(f"โŒ Step counting and bias correction failed: {e}")
        raise
    
    # Test adaptive learning rates
    try:
        # FIX: the original try-block asserted nothing, so it could never
        # fail and always printed success. With different gradient
        # magnitudes (0.1 vs 0.2) Adam accumulates different second-moment
        # estimates per parameter, which is exactly what produces
        # per-parameter (adaptive) effective learning rates.
        m_w1 = optimizer.second_moment[id(w1)]
        m_w2 = optimizer.second_moment[id(w2)]
        assert not np.allclose(m_w1, m_w2), "Second-moment buffers should differ for parameters with different gradients"
        print("โœ… Adaptive learning rates work correctly")
        
    except Exception as e:
        print(f"โŒ Adaptive learning rates failed: {e}")
        raise

    print("๐ŸŽฏ Adam optimizer behavior:")
    print("   Maintains first and second moment estimates")
    print("   Applies bias correction for early training")
    print("   Uses adaptive learning rates per parameter")
    print("   Combines benefits of momentum and RMSprop")
    print("๐Ÿ“ˆ Progress: Adam Optimizer โœ“")

# Run the test
test_adam_optimizer_comprehensive()
#| export
class StepLR:
    """
    Step Learning Rate Scheduler
    
    Decays learning rate by gamma every step_size epochs:
    learning_rate = initial_lr * (gamma ** (epoch // step_size))
    
    Works with any optimizer exposing a mutable ``learning_rate``
    attribute (SGD, Adam, ...), mirroring PyTorch's
    torch.optim.lr_scheduler.StepLR.
    """
    
    def __init__(self, optimizer: Any, step_size: int, gamma: float = 0.1):
        """
        Initialize step learning rate scheduler.
        
        Args:
            optimizer: Optimizer to schedule. Any object with a mutable
                ``learning_rate`` attribute is accepted (duck typing);
                this is deliberately looser than Union[SGD, Adam] so the
                scheduler also works with optimizers added later.
            step_size: Number of epochs between decreases
            gamma: Multiplicative factor for learning rate decay
        
        TODO: Implement learning rate scheduler initialization.
        
        APPROACH:
        1. Store optimizer reference
        2. Store scheduling parameters
        3. Save initial learning rate
        4. Initialize step counter
        
        EXAMPLE:
        ```python
        optimizer = SGD([w1, w2], learning_rate=0.1)
        scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
        
        # In training loop:
        for epoch in range(100):
            train_one_epoch()
            scheduler.step()  # Update learning rate
        ```
        
        HINTS:
        - Store optimizer reference
        - Save initial learning rate from optimizer
        - Initialize step counter to 0
        - gamma is the decay factor (0.1 = 10x reduction)
        """
        ### BEGIN SOLUTION
        self.optimizer = optimizer
        self.step_size = step_size
        self.gamma = gamma
        # Decay is always computed fresh from the initial rate (never
        # compounded in place), so repeated step() calls cannot
        # accumulate floating-point rounding error.
        self.initial_lr = optimizer.learning_rate
        self.step_count = 0
        ### END SOLUTION
    
    def step(self) -> None:
        """
        Update learning rate based on current step.
        
        TODO: Implement learning rate update.
        
        APPROACH:
        1. Increment step counter
        2. Calculate new learning rate using step decay formula
        3. Update optimizer's learning rate
        
        MATHEMATICAL FORMULATION:
        new_lr = initial_lr * (gamma ** ((step_count - 1) // step_size))
        
        The ``- 1`` keeps the rate unchanged for the first step_size
        calls, so the first decay happens on call number step_size + 1.
        
        IMPLEMENTATION HINTS:
        - Use // for integer division
        - Use ** for exponentiation
        - Update optimizer.learning_rate directly
        """
        ### BEGIN SOLUTION
        self.step_count += 1
        
        # Calculate new learning rate
        decay_factor = self.gamma ** ((self.step_count - 1) // self.step_size)
        new_lr = self.initial_lr * decay_factor
        
        # Update optimizer's learning rate
        self.optimizer.learning_rate = new_lr
        ### END SOLUTION
    
    def get_lr(self) -> float:
        """
        Get current learning rate.
        
        TODO: Return current learning rate.
        
        IMPLEMENTATION HINTS:
        - Return optimizer.learning_rate
        """
        ### BEGIN SOLUTION
        return self.optimizer.learning_rate
        ### END SOLUTION
def test_step_scheduler_comprehensive():
    """Test StepLR scheduler implementation"""
    print("๐Ÿ”ฌ Unit Test: Step Learning Rate Scheduler...")
    
    # Create test parameters and optimizer
    w = Variable(1.0, requires_grad=True)
    optimizer = SGD([w], learning_rate=0.1)
    
    def advance(sched, n_steps):
        # Local helper: tick the scheduler n_steps times.
        for _ in range(n_steps):
            sched.step()
    
    # Test scheduler initialization
    try:
        scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
        
        # Test initial learning rate
        assert scheduler.get_lr() == 0.1, f"Initial learning rate should be 0.1, got {scheduler.get_lr()}"
        print("โœ… Initial learning rate is correct")
        
    except Exception as e:
        print(f"โŒ Initial learning rate failed: {e}")
        raise
    
    # Test step-based decay
    try:
        # Steps 1-10: no decay (decay happens after step 10)
        advance(scheduler, 10)
        
        assert scheduler.get_lr() == 0.1, f"Learning rate should still be 0.1 after 10 steps, got {scheduler.get_lr()}"
        
        # Step 11: decay should occur
        scheduler.step()
        expected_lr = 0.1 * 0.1  # 0.01
        assert abs(scheduler.get_lr() - expected_lr) < 1e-6, f"Learning rate should be {expected_lr} after 11 steps, got {scheduler.get_lr()}"
        print("โœ… Step-based decay works correctly")
        
    except Exception as e:
        print(f"โŒ Step-based decay failed: {e}")
        raise
    
    # Test multiple decay levels
    try:
        # Steps 12-20: should stay at 0.01
        advance(scheduler, 9)
        
        assert abs(scheduler.get_lr() - 0.01) < 1e-6, f"Learning rate should be 0.01 after 20 steps, got {scheduler.get_lr()}"
        
        # Step 21: another decay
        scheduler.step()
        expected_lr = 0.01 * 0.1  # 0.001
        assert abs(scheduler.get_lr() - expected_lr) < 1e-6, f"Learning rate should be {expected_lr} after 21 steps, got {scheduler.get_lr()}"
        print("โœ… Multiple decay levels work correctly")
        
    except Exception as e:
        print(f"โŒ Multiple decay levels failed: {e}")
        raise
    
    # Test with different optimizer
    try:
        w2 = Variable(2.0, requires_grad=True)
        adam_optimizer = Adam([w2], learning_rate=0.001)
        adam_scheduler = StepLR(adam_optimizer, step_size=5, gamma=0.5)
        
        # Test initial learning rate
        assert adam_scheduler.get_lr() == 0.001, f"Initial Adam learning rate should be 0.001, got {adam_scheduler.get_lr()}"
        
        # Test decay after 5 steps
        advance(adam_scheduler, 5)
        
        # Learning rate should still be 0.001 after 5 steps
        assert adam_scheduler.get_lr() == 0.001, f"Adam learning rate should still be 0.001 after 5 steps, got {adam_scheduler.get_lr()}"
        
        # Step 6: decay should occur
        adam_scheduler.step()
        expected_lr = 0.001 * 0.5  # 0.0005
        assert abs(adam_scheduler.get_lr() - expected_lr) < 1e-6, f"Adam learning rate should be {expected_lr} after 6 steps, got {adam_scheduler.get_lr()}"
        print("โœ… Works with different optimizers")
        
    except Exception as e:
        print(f"โŒ Different optimizers failed: {e}")
        raise

    print("๐ŸŽฏ Step learning rate scheduler behavior:")
    print("   Reduces learning rate at regular intervals")
    print("   Multiplies current rate by gamma factor")
    print("   Works with any optimizer (SGD, Adam, etc.)")
    print("๐Ÿ“ˆ Progress: Step Learning Rate Scheduler โœ“")
+ "\n", + "# Run the test\n", + "test_step_scheduler_comprehensive()" + ] + }, + { + "cell_type": "markdown", + "id": "2fc52bc2", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## Step 5: Integration - Complete Training Example\n", + "\n", + "### Putting It All Together\n", + "Let's see how optimizers enable complete neural network training:\n", + "\n", + "1. **Forward pass**: Compute predictions\n", + "2. **Loss computation**: Compare with targets\n", + "3. **Backward pass**: Compute gradients\n", + "4. **Optimizer step**: Update parameters\n", + "5. **Learning rate scheduling**: Adjust learning rate\n", + "\n", + "### The Modern Training Loop\n", + "```python\n", + "# Setup\n", + "optimizer = Adam(model.parameters(), learning_rate=0.001)\n", + "scheduler = StepLR(optimizer, step_size=10, gamma=0.1)\n", + "\n", + "# Training loop\n", + "for epoch in range(num_epochs):\n", + " for batch in dataloader:\n", + " # Forward pass\n", + " predictions = model(batch.inputs)\n", + " loss = criterion(predictions, batch.targets)\n", + " \n", + " # Backward pass\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " # Update learning rate\n", + " scheduler.step()\n", + "```\n", + "\n", + "Let's implement a complete training example!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3205aad", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "training-integration", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def train_simple_model():\n", + " \"\"\"\n", + " Complete training example using optimizers.\n", + " \n", + " TODO: Implement a complete training loop.\n", + " \n", + " APPROACH:\n", + " 1. Create a simple model (linear regression)\n", + " 2. Generate training data\n", + " 3. Set up optimizer and scheduler\n", + " 4. Train for several epochs\n", + " 5. 
Show convergence\n", + " \n", + " LEARNING OBJECTIVE:\n", + " - See how optimizers enable real learning\n", + " - Compare SGD vs Adam performance\n", + " - Understand the complete training workflow\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " print(\"Training simple linear regression model...\")\n", + " \n", + " # Create simple model: y = w*x + b\n", + " w = Variable(0.1, requires_grad=True) # Initialize near zero\n", + " b = Variable(0.0, requires_grad=True)\n", + " \n", + " # Training data: y = 2*x + 1\n", + " x_data = [1.0, 2.0, 3.0, 4.0, 5.0]\n", + " y_data = [3.0, 5.0, 7.0, 9.0, 11.0]\n", + " \n", + " # Try SGD first\n", + " print(\"\\n๐Ÿ” Training with SGD...\")\n", + " optimizer_sgd = SGD([w, b], learning_rate=0.01, momentum=0.9)\n", + " \n", + " for epoch in range(60):\n", + " total_loss = 0\n", + " \n", + " for x_val, y_val in zip(x_data, y_data):\n", + " # Forward pass\n", + " x = Variable(x_val, requires_grad=False)\n", + " y_target = Variable(y_val, requires_grad=False)\n", + " \n", + " # Prediction: y = w*x + b\n", + " try:\n", + " from tinytorch.core.autograd import add, multiply, subtract\n", + " except ImportError:\n", + " setup_import_paths()\n", + " from autograd_dev import add, multiply, subtract\n", + " \n", + " prediction = add(multiply(w, x), b)\n", + " \n", + " # Loss: (prediction - target)^2\n", + " error = subtract(prediction, y_target)\n", + " loss = multiply(error, error)\n", + " \n", + " # Backward pass\n", + " optimizer_sgd.zero_grad()\n", + " loss.backward()\n", + " optimizer_sgd.step()\n", + " \n", + " total_loss += loss.data.data.item()\n", + " \n", + " if epoch % 10 == 0:\n", + " print(f\"Epoch {epoch}: Loss = {total_loss:.4f}, w = {w.data.data.item():.3f}, b = {b.data.data.item():.3f}\")\n", + " \n", + " sgd_final_w = w.data.data.item()\n", + " sgd_final_b = b.data.data.item()\n", + " \n", + " # Reset parameters and try Adam\n", + " print(\"\\n๐Ÿ” Training with Adam...\")\n", + " w.data = Tensor(0.1)\n", + " b.data = 
Tensor(0.0)\n", + " \n", + " optimizer_adam = Adam([w, b], learning_rate=0.01)\n", + " \n", + " for epoch in range(60):\n", + " total_loss = 0\n", + " \n", + " for x_val, y_val in zip(x_data, y_data):\n", + " # Forward pass\n", + " x = Variable(x_val, requires_grad=False)\n", + " y_target = Variable(y_val, requires_grad=False)\n", + " \n", + " # Prediction: y = w*x + b\n", + " prediction = add(multiply(w, x), b)\n", + " \n", + " # Loss: (prediction - target)^2\n", + " error = subtract(prediction, y_target)\n", + " loss = multiply(error, error)\n", + " \n", + " # Backward pass\n", + " optimizer_adam.zero_grad()\n", + " loss.backward()\n", + " optimizer_adam.step()\n", + " \n", + " total_loss += loss.data.data.item()\n", + " \n", + " if epoch % 10 == 0:\n", + " print(f\"Epoch {epoch}: Loss = {total_loss:.4f}, w = {w.data.data.item():.3f}, b = {b.data.data.item():.3f}\")\n", + " \n", + " adam_final_w = w.data.data.item()\n", + " adam_final_b = b.data.data.item()\n", + " \n", + " print(f\"\\n๐Ÿ“Š Results:\")\n", + " print(f\"Target: w = 2.0, b = 1.0\")\n", + " print(f\"SGD: w = {sgd_final_w:.3f}, b = {sgd_final_b:.3f}\")\n", + " print(f\"Adam: w = {adam_final_w:.3f}, b = {adam_final_b:.3f}\")\n", + " \n", + " return sgd_final_w, sgd_final_b, adam_final_w, adam_final_b\n", + " ### END SOLUTION" + ] + }, + { + "cell_type": "markdown", + "id": "0a5330c4", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### ๐Ÿงช Unit Test: Complete Training Integration\n", + "\n", + "Let's test your complete training integration! This demonstrates optimizers working together in a realistic training scenario.\n", + "\n", + "**This is a unit test** - it tests the complete training workflow with optimizers in isolation." 
def test_training_integration_comprehensive():
    """Test complete training integration with optimizers"""
    print("๐Ÿ”ฌ Unit Test: Complete Training Integration...")
    
    # Test training with SGD and Adam
    try:
        sgd_w, sgd_b, adam_w, adam_b = train_simple_model()
        
        # Test SGD convergence
        assert abs(sgd_w - 2.0) < 0.1, f"SGD should converge close to w=2.0, got {sgd_w}"
        assert abs(sgd_b - 1.0) < 0.1, f"SGD should converge close to b=1.0, got {sgd_b}"
        print("โœ… SGD convergence works")
        
        # Test Adam convergence (may be different due to adaptive learning rates)
        assert abs(adam_w - 2.0) < 1.0, f"Adam should converge reasonably close to w=2.0, got {adam_w}"
        assert abs(adam_b - 1.0) < 1.0, f"Adam should converge reasonably close to b=1.0, got {adam_b}"
        print("โœ… Adam convergence works")
        
    except Exception as e:
        print(f"โŒ Training integration failed: {e}")
        raise
    
    # Test optimizer comparison
    try:
        # Both optimizers should achieve reasonable results
        sgd_error = (sgd_w - 2.0)**2 + (sgd_b - 1.0)**2
        adam_error = (adam_w - 2.0)**2 + (adam_b - 1.0)**2
        
        # FIX: the old comment claimed both errors must be < 0.1, but the
        # assertions use different bounds: SGD < 0.1, Adam < 1.0 (Adam's
        # adaptive steps settle more slowly on this tiny problem).
        assert sgd_error < 0.1, f"SGD error should be < 0.1, got {sgd_error}"
        assert adam_error < 1.0, f"Adam error should be < 1.0, got {adam_error}"
        print("โœ… Optimizer comparison works")
        
    except Exception as e:
        print(f"โŒ Optimizer comparison failed: {e}")
        raise
    
    # Test gradient flow
    try:
        # Create a simple test to verify gradients flow correctly
        w = Variable(1.0, requires_grad=True)
        b = Variable(0.0, requires_grad=True)
        
        # Set up simple gradients
        w.grad = Variable(0.1)
        b.grad = Variable(0.05)
        
        # Test SGD step
        sgd_optimizer = SGD([w, b], learning_rate=0.1)
        original_w = w.data.data.item()
        original_b = b.data.data.item()
        
        sgd_optimizer.step()
        
        # Check updates
        assert w.data.data.item() != original_w, "SGD should update w"
        assert b.data.data.item() != original_b, "SGD should update b"
        print("โœ… Gradient flow works correctly")
        
    except Exception as e:
        print(f"โŒ Gradient flow failed: {e}")
        raise

    print("๐ŸŽฏ Training integration behavior:")
    print("   Optimizers successfully minimize loss functions")
    print("   SGD and Adam both converge to target values")
    print("   Gradient computation and updates work correctly")
    print("   Ready for real neural network training")
    print("๐Ÿ“ˆ Progress: Complete Training Integration โœ“")

# Run the test
test_training_integration_comprehensive()
You've successfully implemented the optimization algorithms that power all modern neural network training:\n", + "\n", + "## โœ… What You've Built\n", + "- **Gradient Descent**: The fundamental parameter update mechanism\n", + "- **SGD with Momentum**: Accelerated convergence with velocity accumulation\n", + "- **Adam Optimizer**: Adaptive learning rates with first and second moments\n", + "- **Learning Rate Scheduling**: Smart learning rate adjustment during training\n", + "- **Complete Training Integration**: End-to-end training workflow\n", + "\n", + "## โœ… Key Learning Outcomes\n", + "- **Understanding**: How optimizers use gradients to update parameters intelligently\n", + "- **Implementation**: Built SGD and Adam optimizers from mathematical foundations\n", + "- **Mathematical mastery**: Momentum, adaptive learning rates, bias correction\n", + "- **Systems integration**: Complete training loops with scheduling\n", + "- **Real-world application**: Modern deep learning training workflow\n", + "\n", + "## โœ… Mathematical Foundations Mastered\n", + "- **Gradient Descent**: ฮธ = ฮธ - ฮฑโˆ‡L(ฮธ) for parameter updates\n", + "- **Momentum**: v_t = ฮฒv_{t-1} + โˆ‡L(ฮธ) for acceleration\n", + "- **Adam**: Adaptive learning rates with exponential moving averages\n", + "- **Learning Rate Scheduling**: Strategic learning rate adjustment\n", + "\n", + "## โœ… Professional Skills Developed\n", + "- **Algorithm implementation**: Translating mathematical formulas into code\n", + "- **State management**: Tracking optimizer buffers and statistics\n", + "- **Hyperparameter design**: Understanding the impact of learning rate, momentum, etc.\n", + "- **Training orchestration**: Complete training loop design\n", + "\n", + "## โœ… Ready for Advanced Applications\n", + "Your optimizers now enable:\n", + "- **Deep Neural Networks**: Effective training of complex architectures\n", + "- **Computer Vision**: Training CNNs, ResNets, Vision Transformers\n", + "- **Natural Language 
Processing**: Training transformers and language models\n", + "- **Any ML Model**: Gradient-based optimization for any differentiable system\n", + "\n", + "## ๐Ÿ”— Connection to Real ML Systems\n", + "Your implementations mirror production systems:\n", + "- **PyTorch**: `torch.optim.SGD()`, `torch.optim.Adam()`, `torch.optim.lr_scheduler.StepLR()`\n", + "- **TensorFlow**: `tf.keras.optimizers.SGD()`, `tf.keras.optimizers.Adam()`\n", + "- **Industry Standard**: Every major ML framework uses these exact algorithms\n", + "\n", + "## ๐ŸŽฏ The Power of Intelligent Optimization\n", + "You've unlocked the algorithms that made modern AI possible:\n", + "- **Scalability**: Efficiently optimize millions of parameters\n", + "- **Adaptability**: Different learning rates for different parameters\n", + "- **Robustness**: Handle noisy gradients and ill-conditioned problems\n", + "- **Universality**: Work with any differentiable neural network\n", + "\n", + "## ๐Ÿง  Deep Learning Revolution\n", + "You now understand the optimization technology that powers:\n", + "- **ImageNet**: Training state-of-the-art computer vision models\n", + "- **Language Models**: Training GPT, BERT, and other transformers\n", + "- **Modern AI**: Every breakthrough relies on these optimization algorithms\n", + "- **Future Research**: Your understanding enables you to develop new optimizers\n", + "\n", + "## ๐Ÿš€ What's Next\n", + "Your optimizers are the foundation for:\n", + "- **Training Module**: Complete training loops with loss functions and metrics\n", + "- **Advanced Optimizers**: RMSprop, AdaGrad, learning rate warm-up\n", + "- **Distributed Training**: Multi-GPU optimization strategies\n", + "- **Research**: Experimenting with novel optimization algorithms\n", + "\n", + "**Next Module**: Complete training systems that orchestrate your optimizers for real-world ML!\n", + "\n", + "You've built the intelligent algorithms that enable neural networks to learn. 
Now let's use them to train systems that can solve complex real-world problems!\n", + "\"\"\"\n", + "\n", + "Run inline tests when module is executed directly\n", + "if __name__ == \"__main__\":\n", + " from tito.tools.testing import run_module_tests_auto\n", + " \n", + " # Automatically discover and run all tests in this module\n", + " run_module_tests_auto(\"Optimizers\") " + ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tinytorch/core/optimizers.py b/tinytorch/core/optimizers.py new file mode 100644 index 00000000..bec2618a --- /dev/null +++ b/tinytorch/core/optimizers.py @@ -0,0 +1,502 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/08_optimizers/optimizers_dev.ipynb. + +# %% auto 0 +__all__ = ['setup_import_paths', 'gradient_descent_step', 'SGD', 'Adam', 'StepLR'] + +# %% ../../modules/source/08_optimizers/optimizers_dev.ipynb 1 +import math +import numpy as np +import sys +import os +from typing import List, Dict, Any, Optional, Union +from collections import defaultdict + +# Helper function to set up import paths +def setup_import_paths(): + """Set up import paths for development modules.""" + import sys + import os + + # Add module directories to path + base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + tensor_dir = os.path.join(base_dir, '01_tensor') + autograd_dir = os.path.join(base_dir, '07_autograd') + + if tensor_dir not in sys.path: + sys.path.append(tensor_dir) + if autograd_dir not in sys.path: + sys.path.append(autograd_dir) + +# Import our existing components +try: + from tinytorch.core.tensor import Tensor + from tinytorch.core.autograd import Variable +except ImportError: + # For development, try local imports + try: + setup_import_paths() + from tensor_dev import Tensor + from autograd_dev import Variable + except ImportError: + # Create minimal fallback classes for testing + print("Warning: Using fallback classes 
for testing") + + class Tensor: + def __init__(self, data): + self.data = np.array(data) + self.shape = self.data.shape + + def __str__(self): + return f"Tensor({self.data})" + + class Variable: + def __init__(self, data, requires_grad=True): + if isinstance(data, (int, float)): + self.data = Tensor([data]) + else: + self.data = Tensor(data) + self.requires_grad = requires_grad + self.grad = None + + def zero_grad(self): + self.grad = None + + def __str__(self): + return f"Variable({self.data.data})" + +# %% ../../modules/source/08_optimizers/optimizers_dev.ipynb 6 +def gradient_descent_step(parameter: Variable, learning_rate: float) -> None: + """ + Perform one step of gradient descent on a parameter. + + Args: + parameter: Variable with gradient information + learning_rate: How much to update parameter + + TODO: Implement basic gradient descent parameter update. + + STEP-BY-STEP IMPLEMENTATION: + 1. Check if parameter has a gradient + 2. Get current parameter value and gradient + 3. Update parameter: new_value = old_value - learning_rate * gradient + 4. Update parameter data with new value + 5. 
class SGD:
    """
    Stochastic gradient descent with optional momentum and weight decay.

    Per-parameter update rule (g_t already includes weight decay):
        v_t     = momentum * v_{t-1} + g_t
        theta_t = theta_{t-1} - learning_rate * v_t

    Mirrors the behavior of ``torch.optim.SGD``.
    """

    def __init__(self, parameters: List[Variable], learning_rate: float = 0.01,
                 momentum: float = 0.0, weight_decay: float = 0.0):
        """
        Initialize SGD optimizer.

        Args:
            parameters: Variables to optimize.
            learning_rate: Step size (default: 0.01).
            momentum: Velocity decay coefficient; 0.0 gives plain SGD
                (default: 0.0).
            weight_decay: L2 regularization coefficient (default: 0.0).
        """
        ### BEGIN SOLUTION
        self.parameters = parameters
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay

        # Velocity buffers keyed by id(parameter); allocated lazily on the
        # first step() so parameters added here need no extra setup.
        self.momentum_buffers = {}

        # Number of completed optimization steps.
        self.step_count = 0
        ### END SOLUTION

    def step(self) -> None:
        """
        Apply one momentum-SGD update to every parameter that has a
        gradient; parameters with ``grad is None`` are skipped.
        """
        ### BEGIN SOLUTION
        for p in self.parameters:
            if p.grad is None:
                continue  # did not participate in this backward pass

            grad = p.grad.data.data
            if self.weight_decay > 0:
                # L2 regularization folds directly into the gradient.
                grad = grad + self.weight_decay * p.data.data

            key = id(p)
            if key not in self.momentum_buffers:
                self.momentum_buffers[key] = np.zeros_like(p.data.data)

            # Blend the previous velocity with the fresh gradient.
            velocity = self.momentum * self.momentum_buffers[key] + grad
            self.momentum_buffers[key] = velocity

            # Descend along the velocity direction.
            p.data = Tensor(p.data.data - self.learning_rate * velocity)

        self.step_count += 1
        ### END SOLUTION

    def zero_grad(self) -> None:
        """
        Clear the gradient of every parameter (set it to None).

        Call before ``loss.backward()`` so gradients from the previous
        iteration do not accumulate into the new ones.
        """
        ### BEGIN SOLUTION
        for p in self.parameters:
            p.grad = None
        ### END SOLUTION
class Adam:
    """
    Adam optimizer (adaptive moment estimation).

    Tracks an exponential moving average of the gradient (first moment)
    and of the squared gradient (second moment), corrects both for their
    zero initialization, and scales each parameter's update individually:

        m_t = beta1 * m_{t-1} + (1 - beta1) * g
        v_t = beta2 * v_{t-1} + (1 - beta2) * g^2
        p   = p - lr * m_hat / (sqrt(v_hat) + epsilon)
    """

    def __init__(self, parameters: List[Variable], learning_rate: float = 0.001,
                 beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8,
                 weight_decay: float = 0.0):
        """
        Args:
            parameters: Variables whose .data is replaced in place by step().
            learning_rate: Base step size (default 0.001).
            beta1: Decay rate of the first-moment average (default 0.9).
            beta2: Decay rate of the second-moment average (default 0.999).
            epsilon: Additive constant keeping the divisor nonzero.
            weight_decay: L2 regularization strength; 0.0 disables it.
        """
        self.parameters = parameters
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        # Moment estimates keyed by id(parameter); created lazily in step().
        self.first_moment = {}   # m_t
        self.second_moment = {}  # v_t
        # 1-based step counter t, required by the bias-correction terms.
        self.step_count = 0

    def step(self) -> None:
        """Apply one Adam update to every parameter that has a gradient."""
        self.step_count += 1
        b1, b2 = self.beta1, self.beta2
        for param in self.parameters:
            if param.grad is None:
                continue  # no gradient flowed to this parameter
            grad = param.grad.data.data
            # Fold L2 regularization into the gradient before the moments.
            if self.weight_decay > 0:
                grad = grad + self.weight_decay * param.data.data
            key = id(param)
            if key not in self.first_moment:
                self.first_moment[key] = np.zeros_like(param.data.data)
                self.second_moment[key] = np.zeros_like(param.data.data)
            m = b1 * self.first_moment[key] + (1 - b1) * grad
            v = b2 * self.second_moment[key] + (1 - b2) * (grad * grad)
            self.first_moment[key] = m
            self.second_moment[key] = v
            # Bias correction: early averages are pulled toward 0 by the
            # zero-initialized buffers; dividing by (1 - beta^t) undoes that.
            m_hat = m / (1 - b1 ** self.step_count)
            v_hat = v / (1 - b2 ** self.step_count)
            param.data = Tensor(
                param.data.data
                - self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
            )

    def zero_grad(self) -> None:
        """Clear gradients on all parameters (call before each backward pass)."""
        for param in self.parameters:
            param.grad = None
class StepLR:
    """
    Step-decay learning-rate scheduler.

    Every `step_size` calls to step(), the wrapped optimizer's learning
    rate is reduced by a factor of `gamma`:

        lr = initial_lr * gamma ** ((step_count - 1) // step_size)
    """

    def __init__(self, optimizer: Union[SGD, Adam], step_size: int, gamma: float = 0.1):
        """
        Args:
            optimizer: Optimizer whose `learning_rate` attribute is managed.
            step_size: Number of step() calls between successive decays.
            gamma: Multiplicative decay factor (0.1 means a 10x reduction).
        """
        self.optimizer = optimizer
        self.step_size = step_size
        self.gamma = gamma
        # Remember the starting rate so each decay is computed absolutely,
        # avoiding compounding floating-point drift.
        self.initial_lr = optimizer.learning_rate
        self.step_count = 0

    def step(self) -> None:
        """Advance one epoch and recompute the optimizer's learning rate."""
        self.step_count += 1
        # Integer division counts the completed decay intervals so far.
        intervals = (self.step_count - 1) // self.step_size
        self.optimizer.learning_rate = self.initial_lr * self.gamma ** intervals

    def get_lr(self) -> float:
        """Return the learning rate currently set on the wrapped optimizer."""
        return self.optimizer.learning_rate
# %% auto 0
__all__ = ['setup_import_paths', 'MeanSquaredError', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss', 'Accuracy', 'Trainer']

# %% ../../modules/source/09_training/training_dev.ipynb 1
import numpy as np
import sys
import os
import pickle
import json
from pathlib import Path
from typing import List, Dict, Any, Optional, Union, Callable, Tuple
from collections import defaultdict
import time

# Helper function to set up import paths
def setup_import_paths():
    """Add the sibling development-module directories to sys.path.

    Each directory is appended only if it is not already present — the same
    guard used by the optimizers module's helper — so importing this module
    repeatedly does not grow sys.path without bound.
    """
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    module_dirs = [
        '01_tensor', '02_activations', '03_layers', '04_networks',
        '05_cnn', '06_dataloader', '07_autograd', '08_optimizers'
    ]
    for module_dir in module_dirs:
        module_path = os.path.join(base_dir, module_dir)
        # BUGFIX: guard against duplicate sys.path entries on re-import.
        if module_path not in sys.path:
            sys.path.append(module_path)

# Set up paths
setup_import_paths()

# Import all the building blocks we need
try:
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax
    from tinytorch.core.layers import Dense
    from tinytorch.core.networks import Sequential, create_mlp
    from tinytorch.core.cnn import Conv2D, flatten
    from tinytorch.core.dataloader import Dataset, DataLoader
    from tinytorch.core.autograd import Variable
    from tinytorch.core.optimizers import SGD, Adam, StepLR
except ImportError:
    # For development, import from the local *_dev modules instead
    try:
        from tensor_dev import Tensor
        from activations_dev import ReLU, Sigmoid, Tanh, Softmax
        from layers_dev import Dense
        from networks_dev import Sequential, create_mlp
        from cnn_dev import Conv2D, flatten
        from dataloader_dev import Dataset, DataLoader
        from autograd_dev import Variable
        from optimizers_dev import SGD, Adam, StepLR
    except ImportError:
        # Minimal mock classes so the module stays importable for testing
        class Tensor:
            """Bare-bones stand-in: wraps a numpy array."""
            def __init__(self, data):
                self.data = np.array(data)

            def __str__(self):
                return f"Tensor({self.data})"

        class Variable:
            """Stand-in autograd node: holds data and an optional gradient."""
            def __init__(self, data, requires_grad=True):
                self.data = Tensor(data)
                self.requires_grad = requires_grad
                self.grad = None

            def zero_grad(self):
                self.grad = None

            def backward(self):
                # Mock backward pass: pretend d(self)/d(self) == 1.0.
                if self.requires_grad:
                    self.grad = Variable(1.0, requires_grad=False)

            def __str__(self):
                return f"Variable({self.data})"

        class SGD:
            """Stand-in optimizer: stores parameters; step() is a no-op."""
            def __init__(self, parameters, learning_rate=0.01):
                self.parameters = parameters
                self.learning_rate = learning_rate

            def zero_grad(self):
                for param in self.parameters:
                    if hasattr(param, 'zero_grad'):
                        param.zero_grad()

            def step(self):
                pass

        class Sequential:
            """Stand-in network: applies layers left to right."""
            def __init__(self, layers=None):
                self.layers = layers or []

            def __call__(self, x):
                for layer in self.layers:
                    x = layer(x)
                return x

        class DataLoader:
            """Stand-in loader: yields a single fixed batch."""
            def __init__(self, dataset, batch_size=32, shuffle=True):
                self.dataset = dataset
                self.batch_size = batch_size
                self.shuffle = shuffle

            def __iter__(self):
                return iter([(Tensor([1, 2, 3]), Tensor([0]))])
class MeanSquaredError:
    """
    Mean squared error loss for regression.

    MSE = mean((y_pred - y_true) ** 2), averaged over every element.
    """

    def __init__(self):
        """MSE is stateless; nothing to configure."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Return the scalar MSE between predictions and targets.

        Args:
            y_pred: Predictions of any shape.
            y_true: Targets with the same shape as y_pred.

        Returns:
            Scalar Tensor holding the average squared error.
        """
        # Element-wise residuals, squared via the Tensor arithmetic ops.
        residual = y_pred - y_true
        squared = residual * residual
        # Reduce to a single scalar and re-wrap as a Tensor.
        return Tensor(np.mean(squared.data))

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Alias for __call__, mirroring the framework's layer interface."""
        return self.__call__(y_pred, y_true)
class CrossEntropyLoss:
    """
    Cross-entropy loss for multi-class classification.

    Accepts raw logits and applies a numerically stable softmax internally.
    Targets may be integer class indices (shape [batch]) or one-hot rows
    (shape [batch, num_classes]).
    """

    def __init__(self):
        """Stateless loss; nothing to configure."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Return the scalar cross-entropy between logits and targets.

        Args:
            y_pred: Raw logits, shape [batch, num_classes] (or a 1D row
                for a single sample, which is promoted to 2D).
            y_true: Integer class indices or one-hot encoded labels.

        Returns:
            Scalar Tensor holding the batch-mean negative log-likelihood.
        """
        # Promote a single 1D sample to a one-row batch for uniform handling.
        logits = y_pred.data if y_pred.data.ndim != 1 else y_pred.data.reshape(1, -1)

        # Stable softmax: subtracting the row max before exponentiating
        # prevents overflow without changing the result.
        shifted = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        probs = shifted / np.sum(shifted, axis=1, keepdims=True)

        # Clip away exact 0/1 so log() stays finite.
        eps = 1e-15
        probs = np.clip(probs, eps, 1.0 - eps)

        if len(y_true.data.shape) == 1:
            # Integer class indices: pick each row's true-class probability.
            batch = y_true.data.shape[0]
            picked = probs[np.arange(batch), y_true.data.astype(int)]
            value = -np.mean(np.log(picked))
        else:
            # One-hot targets: sum over classes, then average the batch.
            value = -np.mean(np.sum(y_true.data * np.log(probs), axis=1))

        return Tensor(value)

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Alias for __call__."""
        return self.__call__(y_pred, y_true)
class BinaryCrossEntropyLoss:
    """
    Binary cross-entropy loss, computed directly from raw logits.

    BCE = mean over the batch of
        -[y * log(sigmoid(x)) + (1 - y) * log(1 - sigmoid(x))]
    evaluated in a numerically stable form that never exponentiates a
    large positive number.
    """

    def __init__(self):
        """Stateless loss; nothing to configure."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Return the scalar BCE between logits and binary labels.

        Args:
            y_pred: Raw logits, shape [batch] or [batch, 1].
            y_true: Binary labels in {0, 1}, same number of elements.

        Returns:
            Scalar Tensor holding the batch-mean loss.
        """
        x = y_pred.data.flatten()  # logits
        y = y_true.data.flatten()  # labels in {0, 1}

        # Shared stable term: log(1 + exp(-|x|)) cannot overflow.
        tail = np.log(1 + np.exp(-np.abs(x)))
        # log(sigmoid(x))     =  x - max(x, 0)  - tail
        # log(1 - sigmoid(x)) = -x - max(-x, 0) - tail
        log_sig = x - np.maximum(0, x) - tail
        log_one_minus_sig = -x - np.maximum(0, -x) - tail

        per_sample = -(y * log_sig + (1 - y) * log_one_minus_sig)
        return Tensor(np.mean(per_sample))

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Alias for __call__."""
        return self.__call__(y_pred, y_true)
class Accuracy:
    """
    Classification accuracy metric.

    Accuracy = (# predictions matching the labels) / (# predictions),
    returned as a plain Python float in [0, 1].
    """

    def __init__(self):
        """Stateless metric; nothing to configure."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
        """Return the fraction of correct predictions.

        Args:
            y_pred: Per-class scores [batch, num_classes] (argmax is taken)
                or a single score per sample (thresholded at 0.5).
            y_true: Integer class labels, or one-hot rows (argmax is taken).

        Returns:
            Accuracy as a float between 0 and 1.
        """
        scores = y_pred.data
        if scores.ndim > 1 and scores.shape[1] > 1:
            predicted = np.argmax(scores, axis=1)  # multi-class rows
        else:
            predicted = (scores.flatten() > 0.5).astype(int)  # binary scores

        labels = y_true.data
        if labels.ndim > 1 and labels.shape[1] > 1:
            actual = np.argmax(labels, axis=1)  # one-hot -> class indices
        else:
            actual = labels.flatten().astype(int)

        # Mean of the boolean match vector == correct / total.
        return float(np.mean(predicted == actual))

    def forward(self, y_pred: Tensor, y_true: Tensor) -> float:
        """Alias for __call__."""
        return self.__call__(y_pred, y_true)
class Trainer:
    """
    Orchestrates the training loop: batches -> forward -> loss -> update,
    with per-epoch metric averaging and history tracking.
    """

    def __init__(self, model, optimizer, loss_function, metrics=None):
        """
        Args:
            model: Callable mapping a batch of inputs to predictions.
            optimizer: Object exposing zero_grad() and step().
            loss_function: Callable (predictions, targets) -> loss with .data.
            metrics: Optional list of metric callables; each is tracked in
                the history under its lowercased class name.
        """
        self.model = model
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.metrics = metrics or []

        # Per-epoch history of losses and metrics.
        self.history = {'train_loss': [], 'val_loss': [], 'epoch': []}
        for metric in self.metrics:
            name = metric.__class__.__name__.lower()
            self.history['train_' + name] = []
            self.history['val_' + name] = []

        # Training state counters.
        self.current_epoch = 0
        self.current_step = 0

    def _epoch_pass(self, dataloader, training):
        """Run one pass over dataloader; update parameters iff training.

        Returns a dict of batch-averaged loss and metric values.
        """
        names = [m.__class__.__name__.lower() for m in self.metrics]
        totals = {'loss': 0.0}
        for name in names:
            totals[name] = 0.0

        batches = 0
        for inputs, targets in dataloader:
            if training:
                self.optimizer.zero_grad()

            outputs = self.model(inputs)
            loss = self.loss_function(outputs, targets)

            if training:
                # Backward pass (simplified — the autograd hookup would be):
                # loss.backward()
                self.optimizer.step()

            totals['loss'] += loss.data
            for metric, name in zip(self.metrics, names):
                totals[name] += metric(outputs, targets)

            batches += 1
            if training:
                self.current_step += 1

        # Average every tracked quantity over the batch count.
        return {key: value / batches for key, value in totals.items()}

    def train_epoch(self, dataloader):
        """Train on every batch in dataloader; return averaged metrics."""
        return self._epoch_pass(dataloader, training=True)

    def validate_epoch(self, dataloader):
        """Evaluate on dataloader without updating parameters."""
        return self._epoch_pass(dataloader, training=False)

    def fit(self, train_dataloader, val_dataloader=None, epochs=10, verbose=True):
        """Train for `epochs` epochs, optionally validating each one.

        Args:
            train_dataloader: Iterable of (inputs, targets) training batches.
            val_dataloader: Optional iterable of validation batches.
            epochs: Number of epochs to run.
            verbose: Print a per-epoch progress line when True.

        Returns:
            The accumulated history dictionary.
        """
        print(f"Starting training for {epochs} epochs...")

        for epoch in range(epochs):
            self.current_epoch = epoch

            train_metrics = self.train_epoch(train_dataloader)
            val_metrics = (
                self.validate_epoch(val_dataloader)
                if val_dataloader is not None else {}
            )

            # Record this epoch's results.
            self.history['epoch'].append(epoch)
            self.history['train_loss'].append(train_metrics['loss'])
            if val_dataloader is not None:
                self.history['val_loss'].append(val_metrics['loss'])
            for metric in self.metrics:
                name = metric.__class__.__name__.lower()
                self.history['train_' + name].append(train_metrics[name])
                if val_dataloader is not None:
                    self.history['val_' + name].append(val_metrics[name])

            if verbose:
                parts = [f"Epoch {epoch+1}/{epochs} - train_loss: {train_metrics['loss']:.4f}"]
                if val_dataloader is not None:
                    parts.append(f"val_loss: {val_metrics['loss']:.4f}")
                for metric in self.metrics:
                    name = metric.__class__.__name__.lower()
                    parts.append(f"train_{name}: {train_metrics[name]:.4f}")
                    if val_dataloader is not None:
                        parts.append(f"val_{name}: {val_metrics[name]:.4f}")
                print(" - ".join(parts))

        print("Training completed!")
        return self.history