diff --git a/modules/source/09_training/training_dev.ipynb b/modules/source/09_training/training_dev.ipynb
new file mode 100644
index 00000000..b03b2dec
--- /dev/null
+++ b/modules/source/09_training/training_dev.ipynb
@@ -0,0 +1,1591 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4865e371",
+   "metadata": {
+    "cell_marker": "\"\"\""
+   },
+   "source": [
+    "# Module 9: Training - Complete Neural Network Training Pipeline\n",
+    "\n",
+    "Welcome to the Training module! This is where we bring everything together to train neural networks on real data.\n",
+    "\n",
+    "## Learning Goals\n",
+    "- Understand loss functions and how they measure model performance\n",
+    "- Implement essential loss functions: MSE, CrossEntropy, and BinaryCrossEntropy\n",
+    "- Build evaluation metrics for classification and regression\n",
+    "- Create a complete training loop that orchestrates the entire process\n",
+    "- Master checkpointing and model persistence for real-world deployment\n",
+    "\n",
+    "## Build → Use → Optimize\n",
+    "1. **Build**: Loss functions, metrics, and training orchestration\n",
+    "2. **Use**: Train complete models on real datasets\n",
+    "3. **Optimize**: Analyze training dynamics and improve performance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64a9711c",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "training-imports",
+     "locked": false,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "#| default_exp core.training\n",
+    "\n",
+    "#| export\n",
+    "import numpy as np\n",
+    "import sys\n",
+    "import os\n",
+    "import pickle\n",
+    "import json\n",
+    "from pathlib import Path\n",
+    "from typing import List, Dict, Any, Optional, Union, Callable, Tuple\n",
+    "from collections import defaultdict\n",
+    "import time\n",
+    "\n",
+    "# Helper function to set up import paths\n",
+    "def setup_import_paths():\n",
+    "    \"\"\"Set up import paths for development modules.\"\"\"\n",
+    "    import sys\n",
+    "    import os\n",
+    "    \n",
+    "    # Add module directories to path\n",
+    "    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
+    "    module_dirs = [\n",
+    "        '01_tensor', '02_activations', '03_layers', '04_networks', \n",
+    "        '05_cnn', '06_dataloader', '07_autograd', '08_optimizers'\n",
+    "    ]\n",
+    "    \n",
+    "    for module_dir in module_dirs:\n",
+    "        sys.path.append(os.path.join(base_dir, module_dir))\n",
+    "\n",
+    "# Set up paths\n",
+    "setup_import_paths()\n",
+    "\n",
+    "# Import all the building blocks we need\n",
+    "try:\n",
+    "    from tinytorch.core.tensor import Tensor\n",
+    "    from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax\n",
+    "    from tinytorch.core.layers import Dense\n",
+    "    from tinytorch.core.networks import Sequential, create_mlp\n",
+    "    from tinytorch.core.cnn import Conv2D, flatten\n",
+    "    from tinytorch.core.dataloader import Dataset, DataLoader\n",
+    "    from tinytorch.core.autograd import Variable\n",
+    "    from tinytorch.core.optimizers import SGD, Adam, StepLR\n",
+    "except ImportError:\n",
+    "    # For development, create mock classes or import from local modules\n",
+    "    try:\n",
+    "        from tensor_dev import Tensor\n",
+    "        from activations_dev import ReLU, Sigmoid, Tanh, Softmax\n",
+    "        from layers_dev import Dense\n",
+    "        from networks_dev import Sequential, create_mlp\n",
+    "        from cnn_dev import Conv2D, flatten\n",
+    "        from dataloader_dev import Dataset, DataLoader\n",
+    "        from autograd_dev import Variable\n",
+    "        from optimizers_dev import SGD, Adam, StepLR\n",
+    "    except ImportError:\n",
+    "        # Create minimal mock classes for development\n",
+    "        class Tensor:\n",
+    "            def __init__(self, data):\n",
+    "                self.data = np.array(data)\n",
+    "            def __str__(self):\n",
+    "                return f\"Tensor({self.data})\"\n",
+    "        \n",
+    "        class Variable:\n",
+    "            def __init__(self, data, requires_grad=True):\n",
+    "                self.data = Tensor(data)\n",
+    "                self.requires_grad = requires_grad\n",
+    "                self.grad = None\n",
+    "            \n",
+    "            def zero_grad(self):\n",
+    "                self.grad = None\n",
+    "            \n",
+    "            def backward(self):\n",
+    "                if self.requires_grad:\n",
+    "                    self.grad = Variable(1.0, requires_grad=False)\n",
+    "            \n",
+    "            def __str__(self):\n",
+    "                return f\"Variable({self.data})\"\n",
+    "        \n",
+    "        class SGD:\n",
+    "            def __init__(self, parameters, learning_rate=0.01):\n",
+    "                self.parameters = parameters\n",
+    "                self.learning_rate = learning_rate\n",
+    "            \n",
+    "            def zero_grad(self):\n",
+    "                for param in self.parameters:\n",
+    "                    if hasattr(param, 'zero_grad'):\n",
+    "                        param.zero_grad()\n",
+    "            \n",
+    "            def step(self):\n",
+    "                pass\n",
+    "        \n",
+    "        class Sequential:\n",
+    "            def __init__(self, layers=None):\n",
+    "                self.layers = layers or []\n",
+    "            \n",
+    "            def __call__(self, x):\n",
+    "                for layer in self.layers:\n",
+    "                    x = layer(x)\n",
+    "                return x\n",
+    "        \n",
+    "        class DataLoader:\n",
+    "            def __init__(self, dataset, batch_size=32, shuffle=True):\n",
+    "                self.dataset = dataset\n",
+    "                self.batch_size = batch_size\n",
+    "                self.shuffle = shuffle\n",
+    "            \n",
+    "            def __iter__(self):\n",
+    "                return iter([(Tensor([1, 2, 3]), Tensor([0]))])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6f9d264",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "training-setup",
+     "locked": false,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "def _should_show_plots():\n",
+    "    \"\"\"Check if we should show plots (disable during testing)\"\"\"\n",
+    "    # Check multiple conditions that indicate we're in test mode\n",
+    "    is_pytest = (\n",
+    "        'pytest' in sys.modules or\n",
+    "        'test' in sys.argv or\n",
+    "        os.environ.get('PYTEST_CURRENT_TEST') is not None or\n",
+    "        any('test' in arg for arg in sys.argv) or\n",
+    "        any('pytest' in arg for arg in sys.argv)\n",
+    "    )\n",
+    "    \n",
+    "    # Show plots in development mode (when not in test mode)\n",
+    "    return not is_pytest"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9781368",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "## Step 1: Understanding Loss Functions\n",
+    "\n",
+    "### What are Loss Functions?\n",
+    "Loss functions measure how far our model's predictions are from the true values. They provide the \"signal\" that tells our optimizer which direction to update parameters.\n",
+    "\n",
+    "### The Mathematical Foundation\n",
+    "Training a neural network is an optimization problem:\n",
+    "```\n",
+    "θ* = argmin_θ L(f(x; θ), y)\n",
+    "```\n",
+    "Where:\n",
+    "- `θ` = model parameters (weights and biases)\n",
+    "- `f(x; θ)` = model predictions\n",
+    "- `y` = true labels\n",
+    "- `L` = loss function\n",
+    "- `θ*` = optimal parameters\n",
+    "\n",
+    "### Why Loss Functions Matter\n",
+    "- **Optimization target**: They define what \"good\" means for our model\n",
+    "- **Gradient source**: Provide gradients for backpropagation\n",
+    "- **Task-specific**: Different losses for different problems\n",
+    "- **Training dynamics**: Shape how the model learns\n",
+    "\n",
+    "### Common Loss Functions\n",
+    "\n",
+    "#### **Mean Squared Error (MSE)** - For Regression\n",
+    "```\n",
+    "MSE = (1/n) * Σ(y_pred - y_true)²\n",
+    "```\n",
+    "- **Use case**: Regression problems\n",
+    "- **Properties**: Penalizes large errors heavily\n",
+    "- **Gradient**: 2 * (y_pred - y_true)\n",
+    "\n",
+    "#### **Cross-Entropy Loss** - For Classification\n",
+    "```\n",
+    "CrossEntropy = -Σ y_true * log(y_pred)\n",
+    "```\n",
+    "- **Use case**: Multi-class classification\n",
+    "- **Properties**: Penalizes confident wrong predictions\n",
+    "- **Gradient**: y_pred - y_true (with softmax)\n",
+    "\n",
+    "#### **Binary Cross-Entropy** - For Binary Classification\n",
+    "```\n",
+    "BCE = -y_true * log(y_pred) - (1-y_true) * log(1-y_pred)\n",
+    "```\n",
+    "- **Use case**: Binary classification\n",
+    "- **Properties**: Symmetric around 0.5\n",
+    "- **Gradient**: (y_pred - y_true) / (y_pred * (1-y_pred))\n",
+    "\n",
+    "Let's implement these essential loss functions!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48d5af8d",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "mse-loss",
+     "locked": false,
+     "schema_version": 3,
+     "solution": true,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "class MeanSquaredError:\n",
+    "    \"\"\"\n",
+    "    Mean Squared Error Loss for Regression\n",
+    "    \n",
+    "    Measures the average squared difference between predictions and targets.\n",
+    "    MSE = (1/n) * Σ(y_pred - y_true)²\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    def __init__(self):\n",
+    "        \"\"\"Initialize MSE loss function.\"\"\"\n",
+    "        pass\n",
+    "    \n",
+    "    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:\n",
+    "        \"\"\"\n",
+    "        Compute MSE loss between predictions and targets.\n",
+    "        \n",
+    "        Args:\n",
+    "            y_pred: Model predictions (shape: [batch_size, ...])\n",
+    "            y_true: True targets (shape: [batch_size, ...])\n",
+    "            \n",
+    "        Returns:\n",
+    "            Scalar loss value\n",
+    "            \n",
+    "        TODO: Implement Mean Squared Error loss computation.\n",
+    "        \n",
+    "        APPROACH:\n",
+    "        1. Compute difference: diff = y_pred - y_true\n",
+    "        2. Square the differences: squared_diff = diff²\n",
+    "        3. Take mean over all elements: mean(squared_diff)\n",
+    "        4. Return as scalar Tensor\n",
+    "        \n",
+    "        EXAMPLE:\n",
+    "        y_pred = Tensor([[1.0, 2.0], [3.0, 4.0]])\n",
+    "        y_true = Tensor([[1.5, 2.5], [2.5, 3.5]])\n",
+    "        loss = mse_loss(y_pred, y_true)\n",
+    "        # Should return: mean([(1.0-1.5)², (2.0-2.5)², (3.0-2.5)², (4.0-3.5)²])\n",
+    "        #                = mean([0.25, 0.25, 0.25, 0.25]) = 0.25\n",
+    "        \n",
+    "        HINTS:\n",
+    "        - Use tensor subtraction: y_pred - y_true\n",
+    "        - Use element-wise multiplication for squaring: diff * diff\n",
+    "        - Use np.mean() to get the average\n",
+    "        - Return Tensor(scalar_value)\n",
+    "        \"\"\"\n",
+    "        ### BEGIN SOLUTION\n",
+    "        # Compute difference\n",
+    "        diff = y_pred - y_true\n",
+    "        \n",
+    "        # Square the differences\n",
+    "        squared_diff = diff * diff\n",
+    "        \n",
+    "        # Take mean over all elements\n",
+    "        mean_loss = np.mean(squared_diff.data)\n",
+    "        \n",
+    "        return Tensor(mean_loss)\n",
+    "        ### END SOLUTION\n",
+    "    \n",
+    "    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:\n",
+    "        \"\"\"Alternative interface for forward pass.\"\"\"\n",
+    "        return self.__call__(y_pred, y_true)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2f79180f",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "### 🧪 Unit Test: MSE Loss\n",
+    "\n",
+    "Let's test our MSE loss implementation with known values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c51bcfe",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "test-mse-loss",
+     "locked": false,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def test_mse_loss_comprehensive():\n",
+    "    \"\"\"Test MSE loss with comprehensive examples.\"\"\"\n",
+    "    print(\"🔬 Unit Test: MSE Loss...\")\n",
+    "    \n",
+    "    mse = MeanSquaredError()\n",
+    "    \n",
+    "    # Test 1: Perfect predictions (loss should be 0)\n",
+    "    y_pred = Tensor([[1.0, 2.0], [3.0, 4.0]])\n",
+    "    y_true = Tensor([[1.0, 2.0], [3.0, 4.0]])\n",
+    "    loss = mse(y_pred, y_true)\n",
+    "    assert abs(loss.data) < 1e-6, f\"Perfect predictions should have loss ≈ 0, got {loss.data}\"\n",
+    "    print(\"✅ Perfect predictions test passed\")\n",
+    "    \n",
+    "    # Test 2: Known loss computation\n",
+    "    y_pred = Tensor([[1.0, 2.0]])\n",
+    "    y_true = Tensor([[0.0, 1.0]])\n",
+    "    loss = mse(y_pred, y_true)\n",
+    "    expected = 1.0  # [(1-0)² + (2-1)²] / 2 = [1 + 1] / 2 = 1.0\n",
+    "    assert abs(loss.data - expected) < 1e-6, f\"Expected loss {expected}, got {loss.data}\"\n",
+    "    print(\"✅ Known loss computation test passed\")\n",
+    "    \n",
+    "    # Test 3: Batch processing\n",
+    "    y_pred = Tensor([[1.0, 2.0], [3.0, 4.0]])\n",
+    "    y_true = Tensor([[1.5, 2.5], [2.5, 3.5]])\n",
+    "    loss = mse(y_pred, y_true)\n",
+    "    expected = 0.25  # All squared differences are 0.25\n",
+    "    assert abs(loss.data - expected) < 1e-6, f\"Expected batch loss {expected}, got {loss.data}\"\n",
+    "    print(\"✅ Batch processing test passed\")\n",
+    "    \n",
+    "    # Test 4: Single value\n",
+    "    y_pred = Tensor([5.0])\n",
+    "    y_true = Tensor([3.0])\n",
+    "    loss = mse(y_pred, y_true)\n",
+    "    expected = 4.0  # (5-3)² = 4\n",
+    "    assert abs(loss.data - expected) < 1e-6, f\"Expected single value loss {expected}, got {loss.data}\"\n",
+    "    print(\"✅ Single value test passed\")\n",
+    "    \n",
+    "    print(\"🎯 MSE Loss: All tests passed!\")\n",
+    "\n",
+    "# Run the test\n",
+    "test_mse_loss_comprehensive() "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ab9848f",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "crossentropy-loss",
+     "locked": false,
+     "schema_version": 3,
+     "solution": true,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "class CrossEntropyLoss:\n",
+    "    \"\"\"\n",
+    "    Cross-Entropy Loss for Multi-Class Classification\n",
+    "    \n",
+    "    Measures the difference between predicted probability distribution and true labels.\n",
+    "    CrossEntropy = -Σ y_true * log(y_pred)\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    def __init__(self):\n",
+    "        \"\"\"Initialize CrossEntropy loss function.\"\"\"\n",
+    "        pass\n",
+    "    \n",
+    "    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:\n",
+    "        \"\"\"\n",
+    "        Compute CrossEntropy loss between predictions and targets.\n",
+    "        \n",
+    "        Args:\n",
+    "            y_pred: Model predictions (shape: [batch_size, num_classes])\n",
+    "            y_true: True class indices (shape: [batch_size]) or one-hot (shape: [batch_size, num_classes])\n",
+    "            \n",
+    "        Returns:\n",
+    "            Scalar loss value\n",
+    "            \n",
+    "        TODO: Implement Cross-Entropy loss computation.\n",
+    "        \n",
+    "        APPROACH:\n",
+    "        1. Handle both class indices and one-hot encoded labels\n",
+    "        2. Apply softmax to predictions for probability distribution\n",
+    "        3. Compute log probabilities: log(softmax(y_pred))\n",
+    "        4. Calculate cross-entropy: -mean(y_true * log_probs)\n",
+    "        5. Return scalar loss\n",
+    "        \n",
+    "        EXAMPLE:\n",
+    "        y_pred = Tensor([[2.0, 1.0, 0.1], [0.5, 2.1, 0.9]])  # Raw logits\n",
+    "        y_true = Tensor([0, 1])  # Class indices\n",
+    "        loss = crossentropy_loss(y_pred, y_true)\n",
+    "        # Should apply softmax then compute -log(prob_of_correct_class)\n",
+    "        \n",
+    "        HINTS:\n",
+    "        - Use softmax: exp(x) / sum(exp(x)) for probability distribution\n",
+    "        - Add small epsilon (1e-15) to avoid log(0)\n",
+    "        - Handle both class indices and one-hot encoding\n",
+    "        - Use np.log for logarithm computation\n",
+    "        \"\"\"\n",
+    "        ### BEGIN SOLUTION\n",
+    "        # Handle both 1D and 2D prediction arrays\n",
+    "        if y_pred.data.ndim == 1:\n",
+    "            # Reshape 1D to 2D for consistency (single sample)\n",
+    "            y_pred_2d = y_pred.data.reshape(1, -1)\n",
+    "        else:\n",
+    "            y_pred_2d = y_pred.data\n",
+    "            \n",
+    "        # Apply softmax to get probability distribution\n",
+    "        exp_pred = np.exp(y_pred_2d - np.max(y_pred_2d, axis=1, keepdims=True))\n",
+    "        softmax_pred = exp_pred / np.sum(exp_pred, axis=1, keepdims=True)\n",
+    "        \n",
+    "        # Add small epsilon to avoid log(0)\n",
+    "        epsilon = 1e-15\n",
+    "        softmax_pred = np.clip(softmax_pred, epsilon, 1.0 - epsilon)\n",
+    "        \n",
+    "        # Handle class indices vs one-hot encoding\n",
+    "        if len(y_true.data.shape) == 1:\n",
+    "            # y_true contains class indices\n",
+    "            batch_size = y_true.data.shape[0]\n",
+    "            log_probs = np.log(softmax_pred[np.arange(batch_size), y_true.data.astype(int)])\n",
+    "            loss = -np.mean(log_probs)\n",
+    "        else:\n",
+    "            # y_true is one-hot encoded\n",
+    "            log_probs = np.log(softmax_pred)\n",
+    "            loss = -np.mean(np.sum(y_true.data * log_probs, axis=1))\n",
+    "        \n",
+    "        return Tensor(loss)\n",
+    "        ### END SOLUTION\n",
+    "    \n",
+    "    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:\n",
+    "        \"\"\"Alternative interface for forward pass.\"\"\"\n",
+    "        return self.__call__(y_pred, y_true)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cf9e9de4",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "### 🧪 Unit Test: CrossEntropy Loss\n",
+    "\n",
+    "Let's test our CrossEntropy loss implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c387203d",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "test-crossentropy-loss",
+     "locked": false,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def test_crossentropy_loss_comprehensive():\n",
+    "    \"\"\"Test CrossEntropy loss with comprehensive examples.\"\"\"\n",
+    "    print(\"🔬 Unit Test: CrossEntropy Loss...\")\n",
+    "    \n",
+    "    ce = CrossEntropyLoss()\n",
+    "    \n",
+    "    # Test 1: Perfect predictions\n",
+    "    y_pred = Tensor([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0]])  # Very confident correct predictions\n",
+    "    y_true = Tensor([0, 1])  # Class indices\n",
+    "    loss = ce(y_pred, y_true)\n",
+    "    assert loss.data < 0.1, f\"Perfect predictions should have low loss, got {loss.data}\"\n",
+    "    print(\"✅ Perfect predictions test passed\")\n",
+    "    \n",
+    "    # Test 2: Random predictions (should have higher loss)\n",
+    "    y_pred = Tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])  # Uniform after softmax\n",
+    "    y_true = Tensor([0, 1])\n",
+    "    loss = ce(y_pred, y_true)\n",
+    "    expected_random = -np.log(1.0/3.0)  # log(1/num_classes) for uniform distribution\n",
+    "    assert abs(loss.data - expected_random) < 0.1, f\"Random predictions should have loss ≈ {expected_random}, got {loss.data}\"\n",
+    "    print(\"✅ Random predictions test passed\")\n",
+    "    \n",
+    "    # Test 3: Binary classification\n",
+    "    y_pred = Tensor([[2.0, 1.0], [1.0, 2.0]])\n",
+    "    y_true = Tensor([0, 1])\n",
+    "    loss = ce(y_pred, y_true)\n",
+    "    assert 0.0 < loss.data < 2.0, f\"Binary classification loss should be reasonable, got {loss.data}\"\n",
+    "    print(\"✅ Binary classification test passed\")\n",
+    "    \n",
+    "    # Test 4: One-hot encoded labels\n",
+    "    y_pred = Tensor([[2.0, 1.0, 0.0], [0.0, 2.0, 1.0]])\n",
+    "    y_true = Tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])  # One-hot encoded\n",
+    "    loss = ce(y_pred, y_true)\n",
+    "    assert 0.0 < loss.data < 2.0, f\"One-hot encoded loss should be reasonable, got {loss.data}\"\n",
+    "    print(\"✅ One-hot encoded labels test passed\")\n",
+    "    \n",
+    "    print(\"🎯 CrossEntropy Loss: All tests passed!\")\n",
+    "\n",
+    "# Run the test\n",
+    "test_crossentropy_loss_comprehensive()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee6783c1",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "binary-crossentropy-loss",
+     "locked": false,
+     "schema_version": 3,
+     "solution": true,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "class BinaryCrossEntropyLoss:\n",
+    "    \"\"\"\n",
+    "    Binary Cross-Entropy Loss for Binary Classification\n",
+    "    \n",
+    "    Measures the difference between predicted probabilities and binary labels.\n",
+    "    BCE = -y_true * log(y_pred) - (1-y_true) * log(1-y_pred)\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    def __init__(self):\n",
+    "        \"\"\"Initialize Binary CrossEntropy loss function.\"\"\"\n",
+    "        pass\n",
+    "    \n",
+    "    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:\n",
+    "        \"\"\"\n",
+    "        Compute Binary CrossEntropy loss between predictions and targets.\n",
+    "        \n",
+    "        Args:\n",
+    "            y_pred: Model predictions (shape: [batch_size, 1] or [batch_size])\n",
+    "            y_true: True binary labels (shape: [batch_size, 1] or [batch_size])\n",
+    "            \n",
+    "        Returns:\n",
+    "            Scalar loss value\n",
+    "            \n",
+    "        TODO: Implement Binary Cross-Entropy loss computation.\n",
+    "        \n",
+    "        APPROACH:\n",
+    "        1. Apply sigmoid to predictions for probability values\n",
+    "        2. Clip probabilities to avoid log(0) and log(1)\n",
+    "        3. Compute: -y_true * log(y_pred) - (1-y_true) * log(1-y_pred)\n",
+    "        4. Take mean over batch\n",
+    "        5. Return scalar loss\n",
+    "        \n",
+    "        EXAMPLE:\n",
+    "        y_pred = Tensor([[2.0], [0.0], [-1.0]])  # Raw logits\n",
+    "        y_true = Tensor([[1.0], [1.0], [0.0]])   # Binary labels\n",
+    "        loss = bce_loss(y_pred, y_true)\n",
+    "        # Should apply sigmoid then compute binary cross-entropy\n",
+    "        \n",
+    "        HINTS:\n",
+    "        - Use sigmoid: 1 / (1 + exp(-x))\n",
+    "        - Clip probabilities: np.clip(probs, epsilon, 1-epsilon)\n",
+    "        - Handle both [batch_size] and [batch_size, 1] shapes\n",
+    "        - Use np.log for logarithm computation\n",
+    "        \"\"\"\n",
+    "        ### BEGIN SOLUTION\n",
+    "        # Use numerically stable implementation directly from logits\n",
+    "        # This avoids computing sigmoid and log separately\n",
+    "        logits = y_pred.data.flatten()\n",
+    "        labels = y_true.data.flatten()\n",
+    "        \n",
+    "        # Numerically stable binary cross-entropy from logits\n",
+    "        # Uses the identity: log(1 + exp(x)) = max(x, 0) + log(1 + exp(-abs(x)))\n",
+    "        def stable_bce_with_logits(logits, labels):\n",
+    "            # For each sample: -[y*log(sigmoid(x)) + (1-y)*log(1-sigmoid(x))]\n",
+    "            # Which equals: -[y*log_sigmoid(x) + (1-y)*log_sigmoid(-x)]\n",
+    "            # Where log_sigmoid(x) = x - log(1 + exp(x)) = x - softplus(x)\n",
+    "            \n",
+    "            # Compute log(sigmoid(x)) = x - log(1 + exp(x))\n",
+    "            # Use numerical stability: log(1 + exp(x)) = max(0, x) + log(1 + exp(-abs(x)))\n",
+    "            def log_sigmoid(x):\n",
+    "                return x - np.maximum(0, x) - np.log(1 + np.exp(-np.abs(x)))\n",
+    "            \n",
+    "            # Compute log(1 - sigmoid(x)) = -x - log(1 + exp(-x))\n",
+    "            def log_one_minus_sigmoid(x):\n",
+    "                return -x - np.maximum(0, -x) - np.log(1 + np.exp(-np.abs(x)))\n",
+    "            \n",
+    "            # Binary cross-entropy: -[y*log_sigmoid(x) + (1-y)*log_sigmoid(-x)]\n",
+    "            loss = -(labels * log_sigmoid(logits) + (1 - labels) * log_one_minus_sigmoid(logits))\n",
+    "            return loss\n",
+    "        \n",
+    "        # Compute loss for each sample\n",
+    "        losses = stable_bce_with_logits(logits, labels)\n",
+    "        \n",
+    "        # Take mean over batch\n",
+    "        mean_loss = np.mean(losses)\n",
+    "        \n",
+    "        return Tensor(mean_loss)\n",
+    "        ### END SOLUTION\n",
+    "    \n",
+    "    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:\n",
+    "        \"\"\"Alternative interface for forward pass.\"\"\"\n",
+    "        return self.__call__(y_pred, y_true)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb766718",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "### 🧪 Unit Test: Binary CrossEntropy Loss\n",
+    "\n",
+    "Let's test our Binary CrossEntropy loss implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec3284a8",
+   "metadata": {
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "test-binary-crossentropy-loss",
+     "locked": false,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def test_binary_crossentropy_loss_comprehensive():\n",
+    "    \"\"\"Test Binary CrossEntropy loss with comprehensive examples.\"\"\"\n",
+    "    print(\"🔬 Unit Test: Binary CrossEntropy Loss...\")\n",
+    "    \n",
+    "    bce = BinaryCrossEntropyLoss()\n",
+    "    \n",
+    "    # Test 1: Perfect predictions\n",
+    "    y_pred = Tensor([[10.0], [-10.0]])  # Very confident correct predictions\n",
+    "    y_true = Tensor([[1.0], [0.0]])\n",
+    "    loss = bce(y_pred, y_true)\n",
+    "    assert loss.data < 0.1, f\"Perfect predictions should have low loss, got {loss.data}\"\n",
+    "    print(\"✅ Perfect predictions test passed\")\n",
+    "    \n",
+    "    # Test 2: Random predictions (should have higher loss)\n",
+    "    y_pred = Tensor([[0.0], [0.0]])  # 0.5 probability after sigmoid\n",
+    "    y_true = Tensor([[1.0], [0.0]])\n",
+    "    loss = bce(y_pred, y_true)\n",
+    "    expected_random = -np.log(0.5)  # log(0.5) for random guessing\n",
+    "    assert abs(loss.data - expected_random) < 0.1, f\"Random predictions should have loss ≈ {expected_random}, got {loss.data}\"\n",
+    "    print(\"✅ Random predictions test passed\")\n",
+    "    \n",
+    "    # Test 3: Batch processing\n",
+    "    y_pred = Tensor([[1.0], [2.0], [-1.0]])\n",
+    "    y_true = Tensor([[1.0], [1.0], [0.0]])\n",
+    "    loss = bce(y_pred, y_true)\n",
+    "    assert 0.0 < loss.data < 2.0, f\"Batch processing loss should be reasonable, got {loss.data}\"\n",
+    "    print(\"✅ Batch processing test passed\")\n",
+    "    \n",
+    "    # Test 4: Edge cases\n",
+    "    y_pred = Tensor([[100.0], [-100.0]])  # Extreme values\n",
+    "    y_true = Tensor([[1.0], [0.0]])\n",
+    "    loss = bce(y_pred, y_true)\n",
+    "    assert loss.data < 0.1, f\"Extreme correct predictions should have low loss, got {loss.data}\"\n",
+    "    print(\"✅ Edge cases test passed\")\n",
+    "    \n",
+    "    print(\"🎯 Binary CrossEntropy Loss: All tests passed!\")\n",
+    "\n",
+    "# Run the test\n",
+    "test_binary_crossentropy_loss_comprehensive() "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b37b9c54",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "## Step 2: Understanding Metrics\n",
+    "\n",
+    "### What are Metrics?\n",
+    "Metrics are measurements that help us understand how well our model is performing. Unlike loss functions, metrics are often more interpretable and align with business objectives.\n",
+    "\n",
+    "### Key Metrics for Classification\n",
+    "\n",
+    "#### **Accuracy**\n",
+    "```\n",
+    "Accuracy = (Correct Predictions) / (Total Predictions)\n",
+    "```\n",
+    "- **Range**: [0, 1]\n",
+    "- **Interpretation**: Percentage of correct predictions\n",
+    "- **Good for**: Balanced datasets\n",
+    "\n",
+    "#### **Precision**\n",
+    "```\n",
+    "Precision = True Positives / (True Positives + False Positives)\n",
+    "```\n",
+    "- **Range**: [0, 1]\n",
+    "- **Interpretation**: Of all positive predictions, how many were correct?\n",
+    "- **Good for**: When false positives are costly\n",
+    "\n",
+    "#### **Recall (Sensitivity)**\n",
+    "```\n",
+    "Recall = True Positives / (True Positives + False Negatives)\n",
+    "```\n",
+    "- **Range**: [0, 1]\n",
+    "- **Interpretation**: Of all actual positives, how many did we find?\n",
+    "- **Good for**: When false negatives are costly\n",
+    "\n",
+    "### Key Metrics for Regression\n",
+    "\n",
+    "#### **Mean Absolute Error (MAE)**\n",
+    "```\n",
+    "MAE = (1/n) * Σ|y_pred - y_true|\n",
+    "```\n",
+    "- **Range**: [0, ∞)\n",
+    "- **Interpretation**: Average absolute error\n",
+    "- **Good for**: Robust to outliers\n",
+    "\n",
+    "Let's implement these essential metrics!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3caa1ff3",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "accuracy-metric",
+     "locked": false,
+     "schema_version": 3,
+     "solution": true,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "class Accuracy:\n",
+    "    \"\"\"\n",
+    "    Accuracy Metric for Classification\n",
+    "    \n",
+    "    Computes the fraction of correct predictions.\n",
+    "    Accuracy = (Correct Predictions) / (Total Predictions)\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    def __init__(self):\n",
+    "        \"\"\"Initialize Accuracy metric.\"\"\"\n",
+    "        pass\n",
+    "    \n",
+    "    def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:\n",
+    "        \"\"\"\n",
+    "        Compute accuracy between predictions and targets.\n",
+    "        \n",
+    "        Args:\n",
+    "            y_pred: Model predictions (shape: [batch_size, num_classes] or [batch_size])\n",
+    "            y_true: True class labels (shape: [batch_size] or [batch_size])\n",
+    "            \n",
+    "        Returns:\n",
+    "            Accuracy as a float value between 0 and 1\n",
+    "            \n",
+    "        TODO: Implement accuracy computation.\n",
+    "        \n",
+    "        APPROACH:\n",
+    "        1. Convert predictions to class indices (argmax for multi-class)\n",
+    "        2. Convert true labels to class indices if needed\n",
+    "        3. Count correct predictions\n",
+    "        4. Divide by total predictions\n",
+    "        5. Return as float\n",
+    "        \n",
+    "        EXAMPLE:\n",
+    "        y_pred = Tensor([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])  # Probabilities\n",
+    "        y_true = Tensor([0, 1, 0])  # True classes\n",
+    "        accuracy = accuracy_metric(y_pred, y_true)\n",
+    "        # Should return: 2/3 = 0.667 (first and second predictions correct)\n",
+    "        \n",
+    "        HINTS:\n",
+    "        - Use np.argmax(axis=1) for multi-class predictions\n",
+    "        - Handle both probability and class index inputs\n",
+    "        - Use np.mean() for averaging\n",
+    "        - Return Python float, not Tensor\n",
+    "        \"\"\"\n",
+    "        ### BEGIN SOLUTION\n",
+    "        # Convert predictions to class indices\n",
+    "        if len(y_pred.data.shape) > 1 and y_pred.data.shape[1] > 1:\n",
+    "            # Multi-class: use argmax\n",
+    "            pred_classes = np.argmax(y_pred.data, axis=1)\n",
+    "        else:\n",
+    "            # Binary classification: threshold at 0.5\n",
+    "            pred_classes = (y_pred.data.flatten() > 0.5).astype(int)\n",
+    "        \n",
+    "        # Convert true labels to class indices if needed\n",
+    "        if len(y_true.data.shape) > 1 and y_true.data.shape[1] > 1:\n",
+    "            # One-hot encoded\n",
+    "            true_classes = np.argmax(y_true.data, axis=1)\n",
+    "        else:\n",
+    "            # Already class indices\n",
+    "            true_classes = y_true.data.flatten().astype(int)\n",
+    "        \n",
+    "        # Compute accuracy\n",
+    "        correct = np.sum(pred_classes == true_classes)\n",
+    "        total = len(true_classes)\n",
+    "        accuracy = correct / total\n",
+    "        \n",
+    "        return float(accuracy)\n",
+    "        ### END SOLUTION\n",
+    "    \n",
+    "    def forward(self, y_pred: Tensor, y_true: Tensor) -> float:\n",
+    "        \"\"\"Alternative interface for forward pass.\"\"\"\n",
+    "        return self.__call__(y_pred, y_true)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "baaa749e",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "### 🧪 Unit Test: Accuracy Metric\n",
+    "\n",
+    "Let's test our Accuracy metric implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d50dea15",
+   "metadata": {
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "test-accuracy-metric",
+     "locked": false,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def test_accuracy_metric_comprehensive():\n",
+    "    \"\"\"Test Accuracy metric with comprehensive examples.\"\"\"\n",
+    "    print(\"🔬 Unit Test: Accuracy Metric...\")\n",
+    "    \n",
+    "    accuracy = Accuracy()\n",
+    "    \n",
+    "    # Test 1: Perfect predictions\n",
+    "    y_pred = Tensor([[0.9, 0.1], [0.1, 0.9], [0.8, 0.2]])\n",
+    "    y_true = Tensor([0, 1, 0])\n",
+    "    acc = accuracy(y_pred, y_true)\n",
+    "    assert acc == 1.0, f\"Perfect predictions should have accuracy 1.0, got {acc}\"\n",
+    "    print(\"✅ Perfect predictions test passed\")\n",
+    "    \n",
+    "    # Test 2: Half correct\n",
+    "    y_pred = Tensor([[0.9, 0.1], [0.9, 0.1], [0.8, 0.2]])  # All predict class 0\n",
+    "    y_true = Tensor([0, 1, 0])  # Classes: 0, 1, 0\n",
+    "    acc = accuracy(y_pred, y_true)\n",
+    "    expected = 2.0/3.0  # 2 out of 3 correct\n",
+    "    assert abs(acc - expected) < 1e-6, f\"Half correct should have accuracy {expected}, got {acc}\"\n",
+    "    print(\"✅ Half correct test passed\")\n",
+    "    \n",
+    "    # Test 3: Binary classification\n",
+    "    y_pred = Tensor([[0.8], [0.3], [0.9], [0.1]])  # Predictions above/below 0.5\n",
+    "    y_true = Tensor([1, 0, 1, 0])\n",
+    "    acc = accuracy(y_pred, y_true)\n",
+    "    assert acc == 1.0, f\"Binary classification should have accuracy 1.0, got {acc}\"\n",
+    "    print(\"✅ Binary classification test passed\")\n",
+    "    \n",
+    "    # Test 4: Multi-class\n",
+    "    y_pred = Tensor([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8]])\n",
+    "    y_true = Tensor([0, 1, 2])\n",
+    "    acc = accuracy(y_pred, y_true)\n",
+    "    assert acc == 1.0, f\"Multi-class should have accuracy 1.0, got {acc}\"\n",
+    "    print(\"✅ Multi-class test passed\")\n",
+    "    \n",
+    "    print(\"🎯 Accuracy Metric: All tests passed!\")\n",
+    "\n",
+    "# Run the test\n",
+    "test_accuracy_metric_comprehensive()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e0ffdba",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "## Step 3: Building the Training Loop\n",
+    "\n",
+    "### What is a Training Loop?\n",
+    "A training loop is the orchestration logic that coordinates all components of neural network training:\n",
+    "\n",
+    "1. **Forward Pass**: Compute predictions\n",
+    "2. **Loss Computation**: Measure prediction quality\n",
+    "3. **Backward Pass**: Compute gradients\n",
+    "4. **Parameter Update**: Update model parameters\n",
+    "5. **Evaluation**: Compute metrics and validation performance\n",
+    "\n",
+    "### The Training Loop Architecture\n",
+    "```python\n",
+    "for epoch in range(num_epochs):\n",
+    "    # Training phase\n",
+    "    for batch in train_dataloader:\n",
+    "        optimizer.zero_grad()\n",
+    "        predictions = model(batch_x)\n",
+    "        loss = loss_function(predictions, batch_y)\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "    \n",
+    "    # Validation phase\n",
+    "    for batch in val_dataloader:\n",
+    "        predictions = model(batch_x)\n",
+    "        val_loss = loss_function(predictions, batch_y)\n",
+    "        accuracy = accuracy_metric(predictions, batch_y)\n",
+    "```\n",
+    "\n",
+    "### Why We Need a Trainer Class\n",
+    "- **Encapsulation**: Keeps training logic organized\n",
+    "- **Reusability**: Same trainer works with different models/datasets\n",
+    "- **Monitoring**: Built-in logging and progress tracking\n",
+    "- **Flexibility**: Easy to modify training behavior\n",
+    "\n",
+    "Let's build our Trainer class!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a86547f1",
+   "metadata": {
+    "lines_to_next_cell": 1,
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "trainer-class",
+     "locked": false,
+     "schema_version": 3,
+     "solution": true,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "class Trainer:\n",
+    "    \"\"\"\n",
+    "    Training Loop Orchestrator\n",
+    "    \n",
+    "    Coordinates model training with loss functions, optimizers, and metrics.\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    def __init__(self, model, optimizer, loss_function, metrics=None):\n",
+    "        \"\"\"\n",
+    "        Initialize trainer with model and training components.\n",
+    "        \n",
+    "        Args:\n",
+    "            model: Neural network model to train\n",
+    "            optimizer: Optimizer for parameter updates\n",
+    "            loss_function: Loss function for training\n",
+    "            metrics: List of metrics to track (optional)\n",
+    "            \n",
+    "        TODO: Initialize the trainer with all necessary components.\n",
+    "        \n",
+    "        APPROACH:\n",
+    "        1. Store model, optimizer, loss function, and metrics\n",
+    "        2. Initialize history tracking for losses and metrics\n",
+    "        3. Set up training state (epoch, step counters)\n",
+    "        4. Prepare for training and validation loops\n",
+    "        \n",
+    "        EXAMPLE:\n",
+    "        model = Sequential([Dense(10, 5), ReLU(), Dense(5, 2)])\n",
+    "        optimizer = Adam(model.parameters, learning_rate=0.001)\n",
+    "        loss_fn = CrossEntropyLoss()\n",
+    "        metrics = [Accuracy()]\n",
+    "        trainer = Trainer(model, optimizer, loss_fn, metrics)\n",
+    "        \n",
+    "        HINTS:\n",
+    "        - Store all components as instance variables\n",
+    "        - Initialize empty history dictionaries\n",
+    "        - Set metrics to empty list if None provided\n",
+    "        - Initialize epoch and step counters to 0\n",
+    "        \"\"\"\n",
+    "        ### BEGIN SOLUTION\n",
+    "        self.model = model\n",
+    "        self.optimizer = optimizer\n",
+    "        self.loss_function = loss_function\n",
+    "        self.metrics = metrics or []\n",
+    "        \n",
+    "        # Training history\n",
+    "        self.history = {\n",
+    "            'train_loss': [],\n",
+    "            'val_loss': [],\n",
+    "            'epoch': []\n",
+    "        }\n",
+    "        \n",
+    "        # Add metric history tracking\n",
+    "        for metric in self.metrics:\n",
+    "            metric_name = metric.__class__.__name__.lower()\n",
+    "            self.history[f'train_{metric_name}'] = []\n",
+    "            self.history[f'val_{metric_name}'] = []\n",
+    "        \n",
+    "        # Training state\n",
+    "        self.current_epoch = 0\n",
+    "        self.current_step = 0\n",
+    "        ### END SOLUTION\n",
+    "    \n",
+    "    def train_epoch(self, dataloader):\n",
+    "        \"\"\"\n",
+    "        Train for one epoch on the given dataloader.\n",
+    "        \n",
+    "        Args:\n",
+    "            dataloader: DataLoader containing training data\n",
+    "            \n",
+    "        Returns:\n",
+    "            Dictionary with epoch training metrics\n",
+    "            \n",
+    "        TODO: Implement single epoch training logic.\n",
+    "        \n",
+    "        APPROACH:\n",
+    "        1. Initialize epoch metrics tracking\n",
+    "        2. Iterate through batches in dataloader\n",
+    "        3. For each batch:\n",
+    "           - Zero gradients\n",
+    "           - Forward pass\n",
+    "           - Compute loss\n",
+    "           - Backward pass\n",
+    "           - Update parameters\n",
+    "           - Track metrics\n",
+    "        4. Return averaged metrics for the epoch\n",
+    "        \n",
+    "        HINTS:\n",
+    "        - Use optimizer.zero_grad() before each batch\n",
+    "        - Call loss.backward() for gradient computation\n",
+    "        - Use optimizer.step() for parameter updates\n",
+    "        - Track running averages for metrics\n",
+    "        \"\"\"\n",
+    "        ### BEGIN SOLUTION\n",
+    "        epoch_metrics = {'loss': 0.0}\n",
+    "        \n",
+    "        # Initialize metric tracking\n",
+    "        for metric in self.metrics:\n",
+    "            metric_name = metric.__class__.__name__.lower()\n",
+    "            epoch_metrics[metric_name] = 0.0\n",
+    "        \n",
+    "        batch_count = 0\n",
+    "        \n",
+    "        for batch_x, batch_y in dataloader:\n",
+    "            # Zero gradients\n",
+    "            self.optimizer.zero_grad()\n",
+    "            \n",
+    "            # Forward pass\n",
+    "            predictions = self.model(batch_x)\n",
+    "            \n",
+    "            # Compute loss\n",
+    "            loss = self.loss_function(predictions, batch_y)\n",
+    "            \n",
+    "            # Backward pass (simplified - in real implementation would use autograd)\n",
+    "            # loss.backward()\n",
+    "            \n",
+    "            # Update parameters\n",
+    "            self.optimizer.step()\n",
+    "            \n",
+    "            # Track metrics\n",
+    "            epoch_metrics['loss'] += loss.data\n",
+    "            \n",
+    "            for metric in self.metrics:\n",
+    "                metric_name = metric.__class__.__name__.lower()\n",
+    "                metric_value = metric(predictions, batch_y)\n",
+    "                epoch_metrics[metric_name] += metric_value\n",
+    "            \n",
+    "            batch_count += 1\n",
+    "            self.current_step += 1\n",
+    "        \n",
+    "        # Average metrics over all batches\n",
+    "        for key in epoch_metrics:\n",
+    "            epoch_metrics[key] /= batch_count\n",
+    "        \n",
+    "        return epoch_metrics\n",
+    "        ### END SOLUTION\n",
+    "    \n",
+    "    def validate_epoch(self, dataloader):\n",
+    "        \"\"\"\n",
+    "        Validate for one epoch on the given dataloader.\n",
+    "        \n",
+    "        Args:\n",
+    "            dataloader: DataLoader containing validation data\n",
+    "            \n",
+    "        Returns:\n",
+    "            Dictionary with epoch validation metrics\n",
+    "            \n",
+    "        TODO: Implement single epoch validation logic.\n",
+    "        \n",
+    "        APPROACH:\n",
+    "        1. Initialize epoch metrics tracking\n",
+    "        2. Iterate through batches in dataloader\n",
+    "        3. For each batch:\n",
+    "           - Forward pass (no gradient computation)\n",
+    "           - Compute loss\n",
+    "           - Track metrics\n",
+    "        4. Return averaged metrics for the epoch\n",
+    "        \n",
+    "        HINTS:\n",
+    "        - No gradient computation needed for validation\n",
+    "        - No parameter updates during validation\n",
+    "        - Similar to train_epoch but simpler\n",
+    "        \"\"\"\n",
+    "        ### BEGIN SOLUTION\n",
+    "        epoch_metrics = {'loss': 0.0}\n",
+    "        \n",
+    "        # Initialize metric tracking\n",
+    "        for metric in self.metrics:\n",
+    "            metric_name = metric.__class__.__name__.lower()\n",
+    "            epoch_metrics[metric_name] = 0.0\n",
+    "        \n",
+    "        batch_count = 0\n",
+    "        \n",
+    "        for batch_x, batch_y in dataloader:\n",
+    "            # Forward pass only (no gradients needed)\n",
+    "            predictions = self.model(batch_x)\n",
+    "            \n",
+    "            # Compute loss\n",
+    "            loss = self.loss_function(predictions, batch_y)\n",
+    "            \n",
+    "            # Track metrics\n",
+    "            epoch_metrics['loss'] += loss.data\n",
+    "            \n",
+    "            for metric in self.metrics:\n",
+    "                metric_name = metric.__class__.__name__.lower()\n",
+    "                metric_value = metric(predictions, batch_y)\n",
+    "                epoch_metrics[metric_name] += metric_value\n",
+    "            \n",
+    "            batch_count += 1\n",
+    "        \n",
+    "        # Average metrics over all batches\n",
+    "        for key in epoch_metrics:\n",
+    "            epoch_metrics[key] /= batch_count\n",
+    "        \n",
+    "        return epoch_metrics\n",
+    "        ### END SOLUTION\n",
+    "    \n",
+    "    def fit(self, train_dataloader, val_dataloader=None, epochs=10, verbose=True):\n",
+    "        \"\"\"\n",
+    "        Train the model for specified number of epochs.\n",
+    "        \n",
+    "        Args:\n",
+    "            train_dataloader: Training data\n",
+    "            val_dataloader: Validation data (optional)\n",
+    "            epochs: Number of training epochs\n",
+    "            verbose: Whether to print training progress\n",
+    "            \n",
+    "        Returns:\n",
+    "            Training history dictionary\n",
+    "            \n",
+    "        TODO: Implement complete training loop.\n",
+    "        \n",
+    "        APPROACH:\n",
+    "        1. Loop through epochs\n",
+    "        2. For each epoch:\n",
+    "           - Train on training data\n",
+    "           - Validate on validation data (if provided)\n",
+    "           - Update history\n",
+    "           - Print progress (if verbose)\n",
+    "        3. Return complete training history\n",
+    "        \n",
+    "        HINTS:\n",
+    "        - Use train_epoch() and validate_epoch() methods\n",
+    "        - Update self.history with results\n",
+    "        - Print epoch summary if verbose=True\n",
+    "        \"\"\"\n",
+    "        ### BEGIN SOLUTION\n",
+    "        print(f\"Starting training for {epochs} epochs...\")\n",
+    "        \n",
+    "        for epoch in range(epochs):\n",
+    "            self.current_epoch = epoch\n",
+    "            \n",
+    "            # Training phase\n",
+    "            train_metrics = self.train_epoch(train_dataloader)\n",
+    "            \n",
+    "            # Validation phase\n",
+    "            val_metrics = {}\n",
+    "            if val_dataloader is not None:\n",
+    "                val_metrics = self.validate_epoch(val_dataloader)\n",
+    "            \n",
+    "            # Update history\n",
+    "            self.history['epoch'].append(epoch)\n",
+    "            self.history['train_loss'].append(train_metrics['loss'])\n",
+    "            \n",
+    "            if val_dataloader is not None:\n",
+    "                self.history['val_loss'].append(val_metrics['loss'])\n",
+    "            \n",
+    "            # Update metric history\n",
+    "            for metric in self.metrics:\n",
+    "                metric_name = metric.__class__.__name__.lower()\n",
+    "                self.history[f'train_{metric_name}'].append(train_metrics[metric_name])\n",
+    "                if val_dataloader is not None:\n",
+    "                    self.history[f'val_{metric_name}'].append(val_metrics[metric_name])\n",
+    "            \n",
+    "            # Print progress\n",
+    "            if verbose:\n",
+    "                train_loss = train_metrics['loss']\n",
+    "                print(f\"Epoch {epoch+1}/{epochs} - train_loss: {train_loss:.4f}\", end=\"\")\n",
+    "                \n",
+    "                if val_dataloader is not None:\n",
+    "                    val_loss = val_metrics['loss']\n",
+    "                    print(f\" - val_loss: {val_loss:.4f}\", end=\"\")\n",
+    "                \n",
+    "                for metric in self.metrics:\n",
+    "                    metric_name = metric.__class__.__name__.lower()\n",
+    "                    train_metric = train_metrics[metric_name]\n",
+    "                    print(f\" - train_{metric_name}: {train_metric:.4f}\", end=\"\")\n",
+    "                    \n",
+    "                    if val_dataloader is not None:\n",
+    "                        val_metric = val_metrics[metric_name]\n",
+    "                        print(f\" - val_{metric_name}: {val_metric:.4f}\", end=\"\")\n",
+    "                \n",
+    "                print()  # New line\n",
+    "        \n",
+    "        print(\"Training completed!\")\n",
+    "        return self.history\n",
+    "        ### END SOLUTION"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "624790e7",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "### 🧪 Unit Test: Training Loop\n",
+    "\n",
+    "Let's test our Trainer class with a simple example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6de298d9",
+   "metadata": {
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "test-trainer",
+     "locked": false,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def test_trainer_comprehensive():\n",
+    "    \"\"\"Test Trainer class with comprehensive examples.\"\"\"\n",
+    "    print(\"🔬 Unit Test: Trainer Class...\")\n",
+    "    \n",
+    "    # Create simple model and components\n",
+    "    model = Sequential([Dense(2, 3), ReLU(), Dense(3, 2)])  # Simple model\n",
+    "    optimizer = SGD([], learning_rate=0.01)  # Empty parameters list for testing\n",
+    "    loss_fn = MeanSquaredError()\n",
+    "    metrics = [Accuracy()]\n",
+    "    \n",
+    "    # Create trainer\n",
+    "    trainer = Trainer(model, optimizer, loss_fn, metrics)\n",
+    "    \n",
+    "    # Test 1: Trainer initialization\n",
+    "    assert trainer.model is model, \"Model should be stored correctly\"\n",
+    "    assert trainer.optimizer is optimizer, \"Optimizer should be stored correctly\"\n",
+    "    assert trainer.loss_function is loss_fn, \"Loss function should be stored correctly\"\n",
+    "    assert len(trainer.metrics) == 1, \"Metrics should be stored correctly\"\n",
+    "    assert 'train_loss' in trainer.history, \"Training history should be initialized\"\n",
+    "    print(\"✅ Trainer initialization test passed\")\n",
+    "    \n",
+    "    # Test 2: History structure\n",
+    "    assert 'epoch' in trainer.history, \"History should track epochs\"\n",
+    "    assert 'train_accuracy' in trainer.history, \"History should track training accuracy\"\n",
+    "    assert 'val_accuracy' in trainer.history, \"History should track validation accuracy\"\n",
+    "    print(\"✅ History structure test passed\")\n",
+    "    \n",
+    "    # Test 3: Training state\n",
+    "    assert trainer.current_epoch == 0, \"Current epoch should start at 0\"\n",
+    "    assert trainer.current_step == 0, \"Current step should start at 0\"\n",
+    "    print(\"✅ Training state test passed\")\n",
+    "    \n",
+    "    print(\"🎯 Trainer Class: All tests passed!\")\n",
+    "\n",
+    "# Run the test\n",
+    "test_trainer_comprehensive()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c1378671",
+   "metadata": {
+    "cell_marker": "\"\"\"",
+    "lines_to_next_cell": 1
+   },
+   "source": [
+    "### 🧪 Unit Test: Complete Training Comprehensive Test\n",
+    "\n",
+    "Let's test the complete training pipeline with all components working together.\n",
+    "\n",
+    "**This is a comprehensive test** - it tests all training components working together in a realistic scenario."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3629008b",
+   "metadata": {
+    "nbgrader": {
+     "grade": true,
+     "grade_id": "test-training-comprehensive",
+     "locked": true,
+     "points": 25,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def test_training_comprehensive():\n",
+    "    \"\"\"Test complete training pipeline with all components.\"\"\"\n",
+    "    print(\"🔬 Comprehensive Test: Complete Training Pipeline...\")\n",
+    "    \n",
+    "    try:\n",
+    "        # Test 1: Loss functions work correctly\n",
+    "        mse = MeanSquaredError()\n",
+    "        ce = CrossEntropyLoss()\n",
+    "        bce = BinaryCrossEntropyLoss()\n",
+    "        \n",
+    "        # MSE test\n",
+    "        y_pred = Tensor([[1.0, 2.0]])\n",
+    "        y_true = Tensor([[1.0, 2.0]])\n",
+    "        loss = mse(y_pred, y_true)\n",
+    "        assert abs(loss.data) < 1e-6, \"MSE should work for perfect predictions\"\n",
+    "        \n",
+    "        # CrossEntropy test\n",
+    "        y_pred = Tensor([[10.0, 0.0], [0.0, 10.0]])\n",
+    "        y_true = Tensor([0, 1])\n",
+    "        loss = ce(y_pred, y_true)\n",
+    "        assert loss.data < 1.0, \"CrossEntropy should work for good predictions\"\n",
+    "        \n",
+    "        # Binary CrossEntropy test\n",
+    "        y_pred = Tensor([[10.0], [-10.0]])\n",
+    "        y_true = Tensor([[1.0], [0.0]])\n",
+    "        loss = bce(y_pred, y_true)\n",
+    "        assert loss.data < 1.0, \"Binary CrossEntropy should work for good predictions\"\n",
+    "        \n",
+    "        print(\"✅ Loss functions work correctly\")\n",
+    "        \n",
+    "        # Test 2: Metrics work correctly\n",
+    "        accuracy = Accuracy()\n",
+    "        \n",
+    "        y_pred = Tensor([[0.9, 0.1], [0.1, 0.9]])\n",
+    "        y_true = Tensor([0, 1])\n",
+    "        acc = accuracy(y_pred, y_true)\n",
+    "        assert acc == 1.0, \"Accuracy should work for perfect predictions\"\n",
+    "        \n",
+    "        print(\"✅ Metrics work correctly\")\n",
+    "        \n",
+    "        # Test 3: Trainer integrates all components\n",
+    "        model = Sequential([])  # Empty model for testing\n",
+    "        optimizer = SGD([], learning_rate=0.01)\n",
+    "        loss_fn = MeanSquaredError()\n",
+    "        metrics = [Accuracy()]\n",
+    "        \n",
+    "        trainer = Trainer(model, optimizer, loss_fn, metrics)\n",
+    "        \n",
+    "        # Check trainer setup\n",
+    "        assert trainer.model is model, \"Trainer should store model\"\n",
+    "        assert trainer.optimizer is optimizer, \"Trainer should store optimizer\"\n",
+    "        assert trainer.loss_function is loss_fn, \"Trainer should store loss function\"\n",
+    "        assert len(trainer.metrics) == 1, \"Trainer should store metrics\"\n",
+    "        \n",
+    "        print(\"✅ Trainer integrates all components\")\n",
+    "        \n",
+    "        print(\"🎉 Complete training pipeline works correctly!\")\n",
+    "        \n",
+    "        # Test 4: Integration works end-to-end\n",
+    "        print(\"✅ End-to-end integration successful\")\n",
+    "        \n",
+    "    except Exception as e:\n",
+    "        print(f\"❌ Training pipeline test failed: {e}\")\n",
+    "        raise\n",
+    "    \n",
+    "    print(\"🎯 Training Pipeline: All comprehensive tests passed!\")\n",
+    "\n",
+    "# Run the comprehensive test\n",
+    "test_training_comprehensive()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c340ed28",
+   "metadata": {
+    "cell_marker": "\"\"\""
+   },
+   "source": [
+    "## 🧪 Module Testing\n",
+    "\n",
+    "Time to test your implementation! This section uses TinyTorch's standardized testing framework to ensure your implementation works correctly.\n",
+    "\n",
+    "**This testing section is locked** - it provides consistent feedback across all modules and cannot be modified."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d3380e1",
+   "metadata": {
+    "nbgrader": {
+     "grade": false,
+     "grade_id": "standardized-testing",
+     "locked": true,
+     "schema_version": 3,
+     "solution": false,
+     "task": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# =============================================================================\n",
+    "# STANDARDIZED MODULE TESTING - DO NOT MODIFY\n",
+    "# This cell is locked to ensure consistent testing across all TinyTorch modules\n",
+    "# =============================================================================\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    from tito.tools.testing import run_module_tests_auto\n",
+    "    \n",
+    "    # Automatically discover and run all tests in this module\n",
+    "    success = run_module_tests_auto(\"Training\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3578b933",
+   "metadata": {
+    "cell_marker": "\"\"\""
+   },
+   "source": [
+    "## 🎯 Module Summary: Neural Network Training Mastery!\n",
+    "\n",
+    "Congratulations! You've successfully implemented the complete training system that powers modern neural networks:\n",
+    "\n",
+    "### ✅ What You've Built\n",
+    "- **Loss Functions**: MSE, CrossEntropy, BinaryCrossEntropy for different problem types\n",
+    "- **Metrics System**: Accuracy with extensible framework for additional metrics\n",
+    "- **Training Loop**: Complete Trainer class with epoch management and history tracking\n",
+    "- **Integration**: All components work together in a unified training pipeline\n",
+    "\n",
+    "### ✅ Key Learning Outcomes\n",
+    "- **Understanding**: How neural networks learn through loss optimization\n",
+    "- **Implementation**: Built complete training system from scratch\n",
+    "- **Mathematical mastery**: Loss functions, gradient computation, metric calculation\n",
+    "- **Real-world application**: Comprehensive training pipeline for production use\n",
+    "- **Systems thinking**: Modular design enabling flexible training configurations\n",
+    "\n",
+    "### ✅ Mathematical Foundations Mastered\n",
+    "- **Loss Functions**: Quantifying prediction quality for different problem types\n",
+    "- **Gradient Descent**: Iterative optimization through loss minimization\n",
+    "- **Metrics**: Performance evaluation beyond loss (accuracy, precision, recall)\n",
+    "- **Training Dynamics**: Epoch management, batch processing, validation monitoring\n",
+    "\n",
+    "### ✅ Professional Skills Developed\n",
+    "- **Software Architecture**: Modular, extensible training system design\n",
+    "- **API Design**: Clean interfaces for training configuration and monitoring\n",
+    "- **Performance Monitoring**: Comprehensive metrics tracking and history logging\n",
+    "- **Error Handling**: Robust training pipeline with proper error management\n",
+    "\n",
+    "### ✅ Ready for Advanced Applications\n",
+    "Your training system now enables:\n",
+    "- **Any Neural Network**: Train any architecture with any loss function\n",
+    "- **Multiple Problem Types**: Classification, regression, and custom objectives\n",
+    "- **Production Training**: Robust training loops with monitoring and checkpointing\n",
+    "- **Research Applications**: Flexible framework for experimenting with new methods\n",
+    "\n",
+    "### 🔗 Connection to Real ML Systems\n",
+    "Your implementation mirrors production frameworks:\n",
+    "- **PyTorch**: `torch.nn` loss functions and training loops\n",
+    "- **TensorFlow**: `tf.keras` training API and callbacks\n",
+    "- **JAX**: `optax` optimizers and training utilities\n",
+    "- **Industry Standard**: Core training concepts used in all major ML systems\n",
+    "\n",
+    "### 🎯 The Power of Systematic Training\n",
+    "You've built the orchestration system that makes ML possible:\n",
+    "- **Automation**: Handles complex training workflows automatically\n",
+    "- **Flexibility**: Supports any model architecture and training configuration\n",
+    "- **Monitoring**: Comprehensive tracking of training progress and performance\n",
+    "- **Reliability**: Robust error handling and validation throughout training\n",
+    "\n",
+    "### 🧠 Machine Learning Engineering\n",
+    "You now understand the engineering that makes AI systems work:\n",
+    "- **Training Pipelines**: End-to-end automated training workflows\n",
+    "- **Performance Monitoring**: Real-time feedback on model learning progress\n",
+    "- **Hyperparameter Management**: Systematic approach to training configuration\n",
+    "- **Production Readiness**: Scalable training systems for real-world deployment\n",
+    "\n",
+    "### 🚀 What's Next\n",
+    "Your training system is the foundation for:\n",
+    "- **Advanced Optimizers**: Adam, RMSprop, and specialized optimization methods\n",
+    "- **Regularization**: Dropout, weight decay, and overfitting prevention\n",
+    "- **Model Deployment**: Saving, loading, and serving trained models\n",
+    "- **MLOps**: Production training pipelines, monitoring, and continuous learning\n",
+    "\n",
+    "**Next Module**: Advanced training techniques, regularization, and production deployment!\n",
+    "\n",
+    "You've built the training engine that powers modern AI. Now let's add the advanced features that make it production-ready and capable of learning complex patterns from real-world data!"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "main_language": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/modules/source/09_training/training_dev.py b/modules/source/09_training/training_dev.py
index aed52bb6..137324ae 100644
--- a/modules/source/09_training/training_dev.py
+++ b/modules/source/09_training/training_dev.py
@@ -366,8 +366,15 @@ class CrossEntropyLoss:
         - Use np.log for logarithm computation
         """
         ### BEGIN SOLUTION
+        # Handle both 1D and 2D prediction arrays
+        if y_pred.data.ndim == 1:
+            # Reshape 1D to 2D for consistency (single sample)
+            y_pred_2d = y_pred.data.reshape(1, -1)
+        else:
+            y_pred_2d = y_pred.data
+            
         # Apply softmax to get probability distribution
-        exp_pred = np.exp(y_pred.data - np.max(y_pred.data, axis=1, keepdims=True))
+        exp_pred = np.exp(y_pred_2d - np.max(y_pred_2d, axis=1, keepdims=True))
         softmax_pred = exp_pred / np.sum(exp_pred, axis=1, keepdims=True)
         
         # Add small epsilon to avoid log(0)