diff --git a/modules/01_tensor/tensor_dev.py b/modules/01_tensor/tensor_dev.py index 9025db0e..31e638e1 100644 --- a/modules/01_tensor/tensor_dev.py +++ b/modules/01_tensor/tensor_dev.py @@ -1410,7 +1410,7 @@ Your tensor implementation now enables: - **Real data processing**: Handle images, text, and complex multi-dimensional datasets ### Export Your Work -1. **Export to package**: `tito module complete 02_tensor` +1. **Export to package**: `tito module complete 01_tensor` 2. **Verify integration**: Your Tensor class will be available as `tinytorch.core.tensor.Tensor` 3. **Enable next module**: Activations build on your tensor foundation diff --git a/modules/02_activations/activations_dev.py b/modules/02_activations/activations_dev.py index 616f8879..6c5ffbdb 100644 --- a/modules/02_activations/activations_dev.py +++ b/modules/02_activations/activations_dev.py @@ -61,7 +61,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development - import from local modules - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # In[ ]: @@ -1203,7 +1203,7 @@ if __name__ == "__main__": # - **Industry Standard**: Every major ML framework prioritizes optimizing these specific activation functions # ### Next Steps -# 1. **Export your module**: `tito module complete 03_activations` +# 1. **Export your module**: `tito module complete 02_activations` # 2. **Validate integration**: `tito test --module activations` # 3. **Explore activation variants**: Experiment with Leaky ReLU or GELU implementations # 4. **Ready for Module 04**: Layers - combining your activations with linear transformations! 
diff --git a/modules/03_layers/layers_dev.py b/modules/03_layers/layers_dev.py index 65693736..4026037c 100644 --- a/modules/03_layers/layers_dev.py +++ b/modules/03_layers/layers_dev.py @@ -71,7 +71,7 @@ else: # Development: Import from local module files # During development, we need to import directly from the source files # This allows us to work with modules before they're packaged - tensor_module_path = os.path.join(os.path.dirname(__file__), '..', '02_tensor') + tensor_module_path = os.path.join(os.path.dirname(__file__), '..', '01_tensor') sys.path.insert(0, tensor_module_path) try: from tensor_dev import Tensor, Parameter diff --git a/modules/04_losses/losses_dev.ipynb b/modules/04_losses/losses_dev.ipynb new file mode 100644 index 00000000..8f7ab4fe --- /dev/null +++ b/modules/04_losses/losses_dev.ipynb @@ -0,0 +1,2532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "54a999b1", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Loss Functions - Learning Objectives Made Mathematical\n", + "\n", + "Welcome to Loss Functions! 
You'll implement the critical bridge between model predictions and learning objectives that makes neural network training possible.\n", + "\n", + "## πŸ”— Building on Previous Learning\n", + "**What You Built Before**:\n", + "- Module 01 (Tensor): Data structures for predictions and targets\n", + "- Module 02 (Activations): Nonlinear transformations for model outputs\n", + "- Module 03 (Layers): Complete neural network layers that produce predictions\n", + "\n", + "**What's Working**: You can build networks that transform inputs into predictions!\n", + "\n", + "**The Gap**: Predictions aren't learning objectives - you need to measure how \"wrong\" predictions are and provide gradient signals for improvement.\n", + "\n", + "**This Module's Solution**: Implement MSE, CrossEntropy, and BinaryCrossEntropy loss functions with numerical stability.\n", + "\n", + "**Connection Map**:\n", + "```\n", + "Layers β†’ Loss Functions β†’ Gradients\n", + "(predictions) (objectives) (learning signals)\n", + "```\n", + "\n", + "## Learning Goals (Systems-Focused)\n", + "- **Systems understanding**: How loss functions translate business problems into optimization objectives with proper numerical stability\n", + "- **Core implementation skill**: Build production-quality loss functions with stable computation and efficient batch processing\n", + "- **Pattern mastery**: Understand how different loss functions shape learning dynamics and convergence behavior\n", + "- **Framework connections**: See how your implementations mirror PyTorch's loss functions and autograd integration patterns\n", + "- **Optimization trade-offs**: Learn why numerical stability and computational efficiency matter for reliable training at scale\n", + "\n", + "## Build β†’ Use β†’ Reflect\n", + "1. **Build**: Complete loss function implementations with numerical stability and gradient support\n", + "2. **Use**: Apply loss functions to regression and classification problems with real neural networks\n", + "3. 
**Reflect**: Why do different loss functions lead to different learning behaviors, and when does numerical stability matter?\n", + "\n", + "## What You'll Achieve\n", + "By implementing loss functions from scratch, you'll understand:\n", + "- Deep technical understanding of how loss functions quantify prediction quality and enable learning\n", + "- Practical capability to implement numerically stable loss computation for production ML systems\n", + "- Systems insight into computational complexity, memory requirements, and batch processing efficiency\n", + "- Performance awareness of how loss function choice affects training speed and convergence characteristics\n", + "- Production knowledge of how frameworks implement robust loss computation with proper error handling\n", + "\n", + "## Systems Reality Check\n", + "πŸ’‘ **Production Context**: PyTorch's loss functions use numerically stable implementations and automatic mixed precision to handle extreme gradients and values\n", + "⚑ **Performance Insight**: Numerically unstable loss functions can cause training to fail catastrophically - proper implementation is critical for reliable ML systems" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfe05289", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "losses-imports", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "#| default_exp core.losses\n", + "\n", + "#| export\n", + "import numpy as np\n", + "import sys\n", + "import os\n", + "\n", + "# Import our building blocks - try package first, then local modules\n", + "try:\n", + " from tinytorch.core.tensor import Tensor\n", + " # Note: For now, we'll use simplified implementations without full autograd\n", + " # In a complete system, these would integrate with the autograd Variable system\n", + "except ImportError:\n", + " # For development, import from local modules\n", + " 
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))\n", + " from tensor_dev import Tensor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0f986fc", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "losses-setup", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "print(\"πŸ”₯ TinyTorch Loss Functions Module\")\n", + "print(f\"NumPy version: {np.__version__}\")\n", + "print(f\"Python version: {sys.version_info.major}.{sys.version_info.minor}\")\n", + "print(\"Ready to build loss functions for neural network training!\")" + ] + }, + { + "cell_type": "markdown", + "id": "899f0152", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## Where This Code Lives in the Final Package\n", + "\n", + "**Learning Side:** You work in modules/04_losses/losses_dev.py \n", + "**Building Side:** Code exports to tinytorch.core.losses\n", + "\n", + "```python\n", + "# Final package structure:\n", + "from tinytorch.core.losses import MeanSquaredError, CrossEntropyLoss, BinaryCrossEntropyLoss # All loss functions!\n", + "from tinytorch.core.tensor import Tensor # The foundation\n", + "from tinytorch.core.layers import Linear, Sequential # Network components\n", + "```\n", + "\n", + "**Why this matters:**\n", + "- **Learning:** Focused module for understanding loss functions and training objectives\n", + "- **Production:** Proper organization like PyTorch's torch.nn with all loss functions together\n", + "- **Consistency:** All loss functions live together in core.losses for easy access\n", + "- **Integration:** Works seamlessly with tensors and neural networks for complete training systems" + ] + }, + { + "cell_type": "markdown", + "id": "409b9591", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Understanding Loss Functions in Neural Networks\n", + "\n", + "## What are Loss Functions?\n", + "\n", + "Loss functions are the 
mathematical bridge between what your model predicts and what you want it to learn. They quantify the \"distance\" between predictions and reality.\n", + "\n", + "```\n", + "Business Goal: \"Predict house prices accurately\"\n", + " ↓\n", + "Mathematical Loss: MSE = (predicted_price - actual_price)Β²\n", + " ↓ \n", + "Optimization Signal: gradient = 2 Γ— (predicted - actual)\n", + " ↓\n", + "Learning Update: parameter -= learning_rate Γ— gradient\n", + "```\n", + "\n", + "## The Learning Ecosystem\n", + "\n", + "Loss functions provide four critical capabilities:\n", + "\n", + "🎯 **Learning Objectives**: Define what \"good\" performance means mathematically \n", + "πŸ“ˆ **Gradient Signal**: Provide directional improvement information for parameters \n", + "πŸ” **Progress Measurement**: Enable monitoring training progress and convergence detection \n", + "βš–οΈ **Trade-off Control**: Balance different aspects of model performance and regularization \n", + "\n", + "## Visual Understanding: Loss Function Landscape\n", + "\n", + "```\n", + "Loss Function Behavior:\n", + " MSE Loss CrossEntropy Loss\n", + " High β”‚ β•±β•² High β”‚ β•±β•²\n", + " β”‚ β•± β•² β”‚ β•± β•²\n", + " β”‚ β•± β•² β”‚ β•± β•²\n", + " β”‚ β•± β•² β”‚ β•± β•²\n", + " Low β”‚β•± β•² Low β”‚ β•± β•²\n", + " └────────────── └──────────────\n", + " Wrong Right Wrong Right\n", + " \n", + " β€’ Smooth gradients β€’ Steep near wrong predictions\n", + " β€’ Quadratic penalty β€’ Gentle near correct predictions\n", + " β€’ Good for regression β€’ Good for classification\n", + "```\n", + "\n", + "Different loss functions create different optimization landscapes that affect how your model learns!" + ] + }, + { + "cell_type": "markdown", + "id": "429bbae2", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Mean Squared Error - Foundation for Regression\n", + "\n", + "MSE is the cornerstone loss function for regression problems. 
It measures prediction quality by penalizing large errors more than small ones.\n", + "\n", + "## Visual Understanding: MSE Behavior\n", + "\n", + "```\n", + "MSE Loss Visualization:\n", + " \n", + " Loss β”‚ β•±β•²\n", + " 4 β”‚ β•± β•² β€’ Error = 2 β†’ Loss = 4\n", + " 3 β”‚ β•± β•² β€’ Error = 1 β†’ Loss = 1\n", + " 2 β”‚ β•± β•² β€’ Error = 0 β†’ Loss = 0\n", + " 1 β”‚ β•± β•² β€’ Quadratic penalty!\n", + " 0 β”‚β•±__________β•²____\n", + " -2 -1 0 1 2\n", + " Error\n", + " \n", + "Gradient Flow:\n", + " βˆ‚Loss/βˆ‚prediction = 2 Γ— (predicted - actual)\n", + " \n", + " Large errors β†’ Large gradients β†’ Big updates\n", + " Small errors β†’ Small gradients β†’ Fine tuning\n", + "```\n", + "\n", + "## Mathematical Foundation\n", + "\n", + "For batch of predictions and targets:\n", + "```\n", + "MSE = (1/n) Γ— Ξ£(y_pred - y_true)Β²\n", + "\n", + "Gradient: βˆ‚MSE/βˆ‚y_pred = (2/n) Γ— (y_pred - y_true)\n", + "```\n", + "\n", + "## Learning Objectives\n", + "By implementing MSE, you'll understand:\n", + "- How regression loss functions translate continuous prediction errors into optimization signals\n", + "- Why squared error creates smooth, well-behaved gradients for stable optimization\n", + "- How batch processing enables efficient training on multiple samples simultaneously\n", + "- The connection between mathematical loss formulations and practical ML training dynamics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80f4f2d2", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "mse-concept-question", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Computational Question: MSE Properties**\n", + "\n", + "Before implementing, let's understand MSE behavior:\n", + "\n", + "1. If you predict house price as $300k but actual is $250k, what's the MSE?\n", + "2. 
If you predict $310k but actual is $250k, what's the MSE? \n", + "3. Which error gets penalized more heavily and why?\n", + "4. How does this relate to the quadratic penalty we visualized?\n", + "\n", + "This understanding will guide your implementation approach.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2533af31", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "mse-loss-implementation", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class MeanSquaredError:\n", + " \"\"\"\n", + " Mean Squared Error Loss for Regression Problems\n", + " \n", + " Computes the average squared difference between predictions and targets:\n", + " MSE = (1/n) Γ— Ξ£(y_pred - y_true)Β²\n", + " \n", + " Features:\n", + " - Numerically stable computation\n", + " - Efficient batch processing\n", + " - Clean gradient properties for optimization\n", + " - Compatible with tensor operations\n", + " \n", + " Example Usage:\n", + " mse = MeanSquaredError()\n", + " loss = mse(predictions, targets) # Returns scalar loss value\n", + " \"\"\"\n", + " \n", + " def __init__(self):\n", + " \"\"\"Initialize MSE loss function.\"\"\"\n", + " pass\n", + " \n", + " def __call__(self, y_pred, y_true):\n", + " \"\"\"\n", + " Compute MSE loss between predictions and targets.\n", + " \n", + " Args:\n", + " y_pred: Model predictions (Tensor, shape: [batch_size, ...])\n", + " y_true: True targets (Tensor, shape: [batch_size, ...])\n", + " \n", + " Returns:\n", + " Tensor with scalar loss value\n", + " \n", + " TODO: Implement MSE computation with proper tensor handling.\n", + " \n", + " APPROACH:\n", + " 1. Convert inputs to tensors for consistent processing\n", + " 2. Compute element-wise prediction errors (differences)\n", + " 3. Square the errors to create quadratic penalty\n", + " 4. 
Take mean across all elements for final loss\n", + " \n", + " EXAMPLE:\n", + " >>> mse = MeanSquaredError()\n", + " >>> pred = Tensor([[1.0, 2.0]])\n", + " >>> true = Tensor([[1.5, 1.5]])\n", + " >>> loss = mse(pred, true)\n", + " >>> print(loss.data)\n", + " 0.25 # [(1.0-1.5)Β² + (2.0-1.5)Β²] / 2 = [0.25 + 0.25] / 2\n", + " \n", + " HINTS:\n", + " - Use np.mean() for efficient batch averaging\n", + " - Element-wise operations work naturally with tensor.data\n", + " - Return result wrapped in Tensor for consistent interface\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " # Step 1: Ensure we have tensor inputs for consistent processing\n", + " if not isinstance(y_pred, Tensor):\n", + " y_pred = Tensor(y_pred)\n", + " if not isinstance(y_true, Tensor):\n", + " y_true = Tensor(y_true)\n", + " \n", + " # Step 2: Compute mean squared error with element-wise operations\n", + " prediction_errors = y_pred.data - y_true.data # Element-wise difference\n", + " squared_errors = prediction_errors * prediction_errors # Element-wise squaring\n", + " mean_loss = np.mean(squared_errors) # Average across all elements\n", + " \n", + " return Tensor(mean_loss)\n", + " ### END SOLUTION\n", + " \n", + " def forward(self, y_pred, y_true):\n", + " \"\"\"Alternative interface for forward pass.\"\"\"\n", + " return self.__call__(y_pred, y_true)\n", + "\n", + "# πŸ” SYSTEMS INSIGHT: Gradient Landscape Visualization\n", + "def visualize_loss_landscapes():\n", + " \"\"\"Visualize how different loss functions create different optimization landscapes.\"\"\"\n", + " print(\"πŸ” Loss Function Landscape Visualization\")\n", + " print(\"=\" * 45)\n", + "\n", + " try:\n", + " import numpy as np\n", + "\n", + " # Create prediction space for visualization\n", + " prediction_range = np.linspace(-3, 3, 100)\n", + " true_value = 0.0 # Target value\n", + "\n", + " print(\"\\nπŸ“ˆ Loss Landscape Comparison:\")\n", + " print(\" How loss changes as predictions move away from target\")\n", + "\n", + " # 
Calculate loss landscapes\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + "\n", + " # MSE landscape (regression)\n", + " mse_losses = []\n", + " for pred in prediction_range:\n", + " loss = mse(Tensor([pred]), Tensor([true_value]))\n", + " mse_losses.append(loss.data)\n", + "\n", + " # Binary CE landscape (classification)\n", + " bce_losses = []\n", + " for pred in prediction_range:\n", + " loss = bce(Tensor([pred]), Tensor([1.0])) # Target: positive class\n", + " bce_losses.append(loss.data)\n", + "\n", + " # Find key gradient characteristics\n", + " mse_gradient_at_zero = 2 * (0 - true_value) # MSE gradient formula\n", + " mse_gradient_at_one = 2 * (1 - true_value)\n", + "\n", + " print(f\"\\n🎯 Gradient Behavior Analysis:\")\n", + " print(f\" MSE gradient at prediction=0: {mse_gradient_at_zero:.3f}\")\n", + " print(f\" MSE gradient at prediction=1: {mse_gradient_at_one:.3f}\")\n", + " print(f\" MSE provides linear gradient growth\")\n", + "\n", + " # Binary CE gradient analysis\n", + " sigmoid_at_zero = 1 / (1 + np.exp(-0)) # = 0.5\n", + " bce_grad_at_zero = sigmoid_at_zero - 1.0 # = -0.5\n", + " sigmoid_at_one = 1 / (1 + np.exp(-1)) # β‰ˆ 0.73\n", + " bce_grad_at_one = sigmoid_at_one - 1.0 # β‰ˆ -0.27\n", + "\n", + " print(f\" BCE gradient at logit=0: {bce_grad_at_zero:.3f}\")\n", + " print(f\" BCE gradient at logit=1: {bce_grad_at_one:.3f}\")\n", + " print(f\" BCE provides adaptive gradient magnitude\")\n", + "\n", + " # Visualize ASCII loss curves\n", + " print(f\"\\nπŸ“Š Loss Function Shapes (ASCII visualization):\")\n", + " print(f\" Prediction range: {prediction_range[0]:.1f} to {prediction_range[-1]:.1f}\")\n", + "\n", + " # Sample key points for visualization\n", + " sample_points = [-2, -1, 0, 1, 2]\n", + " print(f\"\\n {'Prediction':>10} {'MSE Loss':>10} {'BCE Loss':>10} {'Gradient Type':>15}\")\n", + " print(f\" {'-'*10} {'-'*10} {'-'*10} {'-'*15}\")\n", + "\n", + " for point in 
sample_points:\n", + " mse_loss = mse(Tensor([point]), Tensor([0.0]))\n", + " bce_loss = bce(Tensor([point]), Tensor([1.0]))\n", + "\n", + " # Characterize gradient steepness\n", + " if abs(point) < 0.5:\n", + " grad_type = \"Gentle\"\n", + " elif abs(point) < 1.5:\n", + " grad_type = \"Moderate\"\n", + " else:\n", + " grad_type = \"Steep\"\n", + "\n", + " print(f\" {point:>10.1f} {mse_loss.data:>10.3f} {bce_loss.data:>10.3f} {grad_type:>15}\")\n", + "\n", + " # Optimization implications\n", + " print(f\"\\nπŸš€ Optimization Implications:\")\n", + " print(f\" MSE (Regression):\")\n", + " print(f\" β€’ Quadratic penalty grows smoothly\")\n", + " print(f\" β€’ Large errors β†’ large gradients (aggressive correction)\")\n", + " print(f\" β€’ Small errors β†’ small gradients (fine-tuning)\")\n", + " print(f\" β€’ Symmetric around target value\")\n", + "\n", + " print(f\" Binary CrossEntropy (Classification):\")\n", + " print(f\" β€’ Logarithmic penalty creates adaptive gradients\")\n", + " print(f\" β€’ Wrong confident predictions β†’ steep gradients\")\n", + " print(f\" β€’ Right confident predictions β†’ gentle gradients\")\n", + " print(f\" β€’ Asymmetric penalty structure encourages confidence\")\n", + "\n", + " # πŸ’‘ WHY THIS MATTERS: Different loss landscapes create different\n", + " # optimization dynamics. 
MSE's smooth quadratic surface enables\n", + " # stable gradient descent, while CrossEntropy's adaptive gradients\n", + " # help classification models learn faster from confident mistakes.\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Visualization error: {e}\")\n", + " print(\"Ensure loss functions are implemented for landscape analysis\")\n", + "\n", + "# πŸ” SYSTEMS INSIGHT: MSE Computational Analysis\n", + "def analyze_mse_properties():\n", + " \"\"\"Analyze MSE loss characteristics for systems understanding.\"\"\"\n", + " print(\"πŸ” MSE Loss Analysis - Understanding the Math\")\n", + " print(\"=\" * 45)\n", + " \n", + " try:\n", + " mse = MeanSquaredError()\n", + " \n", + " # Error magnitude vs loss relationship\n", + " print(\"\\nπŸ“Š Error Magnitude vs Loss (Quadratic Penalty):\")\n", + " errors = [0.1, 0.5, 1.0, 2.0, 5.0]\n", + " for error in errors:\n", + " pred = Tensor([error])\n", + " true = Tensor([0.0])\n", + " loss = mse(pred, true)\n", + " print(f\" Error: {error:4.1f} β†’ Loss: {loss.data:8.3f} (Γ— {loss.data/(error**2):5.1f} baseline)\")\n", + " \n", + " # Batch vs individual processing\n", + " print(f\"\\n⚑ Batch Processing Efficiency:\")\n", + " single_losses = []\n", + " for i in range(100):\n", + " pred = Tensor([np.random.randn()])\n", + " true = Tensor([np.random.randn()])\n", + " loss = mse(pred, true)\n", + " single_losses.append(loss.data)\n", + " \n", + " # Batch version\n", + " batch_pred = Tensor(np.random.randn(100))\n", + " batch_true = Tensor(np.random.randn(100))\n", + " batch_loss = mse(batch_pred, batch_true)\n", + " \n", + " individual_mean = np.mean(single_losses)\n", + " print(f\" Individual losses mean: {individual_mean:.6f}\")\n", + " print(f\" Batch loss: {batch_loss.data:.6f}\")\n", + " print(f\" Difference: {abs(individual_mean - batch_loss.data):.8f}\")\n", + " \n", + " # Memory efficiency analysis\n", + " import sys\n", + " small_tensor = Tensor([1.0])\n", + " large_tensor = 
Tensor(np.random.randn(1000))\n", + " \n", + " print(f\"\\nπŸ’Ύ Memory Efficiency:\")\n", + " print(f\" Small loss memory: {sys.getsizeof(small_tensor.data)} bytes\")\n", + " print(f\" Large loss memory: {sys.getsizeof(large_tensor.data)} bytes\")\n", + " print(f\" MSE memory is independent of input size!\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: MSE provides stable, well-behaved gradients\n", + " # that are proportional to error magnitude, making optimization smooth.\n", + " # The quadratic penalty means large errors dominate learning initially,\n", + " # then fine-tuning happens as errors get smaller.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Analysis error: {e}\")\n", + " print(\"Ensure MSE implementation is complete before running analysis\")" + ] + }, + { + "cell_type": "markdown", + "id": "c0b9be9f", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### πŸ§ͺ Unit Test: MSE Loss Computation\n", + "This test validates `MeanSquaredError.__call__`, ensuring correct MSE computation with various input types and batch sizes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39a9be44", + "metadata": { + "nbgrader": { + "grade": true, + "grade_id": "test-mse-loss", + "locked": true, + "points": 3, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_unit_mse_loss():\n", + " \"\"\"Test MSE loss implementation.\"\"\"\n", + " print(\"πŸ§ͺ Testing Mean Squared Error Loss...\")\n", + " \n", + " mse = MeanSquaredError()\n", + " \n", + " # Test case 1: Perfect predictions (loss should be 0)\n", + " y_pred = Tensor([[1.0, 2.0], [3.0, 4.0]])\n", + " y_true = Tensor([[1.0, 2.0], [3.0, 4.0]])\n", + " loss = mse(y_pred, y_true)\n", + " assert abs(loss.data) < 1e-6, f\"Perfect predictions should have loss β‰ˆ 0, got {loss.data}\"\n", + " print(\"βœ… Perfect predictions test passed\")\n", + " \n", + " # Test case 2: Known loss computation\n", + " y_pred = Tensor([[1.0, 2.0]])\n", + " y_true = Tensor([[0.0, 1.0]])\n", + " loss = mse(y_pred, y_true)\n", + " expected = 1.0 # [(1-0)Β² + (2-1)Β²] / 2 = [1 + 1] / 2 = 1.0\n", + " assert abs(loss.data - expected) < 1e-6, f\"Expected loss {expected}, got {loss.data}\"\n", + " print(\"βœ… Known loss computation test passed\")\n", + " \n", + " # Test case 3: Batch processing\n", + " y_pred = Tensor([[1.0, 2.0], [3.0, 4.0]])\n", + " y_true = Tensor([[1.5, 2.5], [2.5, 3.5]])\n", + " loss = mse(y_pred, y_true)\n", + " expected = 0.25 # All squared differences are 0.25\n", + " assert abs(loss.data - expected) < 1e-6, f\"Expected batch loss {expected}, got {loss.data}\"\n", + " print(\"βœ… Batch processing test passed\")\n", + " \n", + " # Test case 4: Single value\n", + " y_pred = Tensor([5.0])\n", + " y_true = Tensor([3.0])\n", + " loss = mse(y_pred, y_true)\n", + " expected = 4.0 # (5-3)Β² = 4\n", + " assert abs(loss.data - expected) < 1e-6, f\"Expected single value loss {expected}, got {loss.data}\"\n", + " print(\"βœ… Single value test passed\")\n", + " \n", + " print(\"πŸŽ‰ MSE 
loss tests passed! Understanding regression objectives.\")\n", + "\n", + "test_unit_mse_loss()" + ] + }, + { + "cell_type": "markdown", + "id": "48e960ae", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Cross-Entropy Loss - Foundation for Multi-Class Classification\n", + "\n", + "Cross-Entropy Loss measures the \"information distance\" between predicted probability distributions and true class labels. It's the gold standard for classification problems.\n", + "\n", + "## Visual Understanding: Cross-Entropy Behavior\n", + "\n", + "```\n", + "Cross-Entropy Loss for 3-Class Problem:\n", + "\n", + "Class Probabilities after Softmax:\n", + " Input: [2.0, 1.0, 0.1] β†’ Probabilities: [0.66, 0.24, 0.10]\n", + " True: Class 0 (index 0) β†’ Target: [1.0, 0.0, 0.0]\n", + " \n", + "Loss Computation:\n", + " CE = -log(probability_of_correct_class)\n", + " CE = -log(0.66) = 0.415\n", + " \n", + "Intuition:\n", + " High confidence + Correct β†’ Low loss\n", + " High confidence + Wrong β†’ High loss \n", + " Low confidence + Any β†’ Medium loss\n", + "\n", + "Gradient Behavior:\n", + " Wrong predictions β†’ Steep gradients β†’ Big corrections\n", + " Right predictions β†’ Gentle gradients β†’ Fine tuning\n", + "```\n", + "\n", + "## Numerical Stability Challenge\n", + "\n", + "```\n", + "The Numerical Stability Problem:\n", + " \n", + " Raw logits: [50.0, 49.0, 48.0]\n", + " Naive softmax: exp(50)/[exp(50)+exp(49)+exp(48)]\n", + " Problem: exp(50) β‰ˆ 5Γ—10Β²ΒΉ β†’ Overflow!\n", + " \n", + "Our Solution (Log-Sum-Exp Trick):\n", + " 1. max_val = max(logits) = 50.0\n", + " 2. stable_logits = [0.0, -1.0, -2.0] # Subtract max\n", + " 3. exp([0.0, -1.0, -2.0]) = [1.0, 0.37, 0.14]\n", + " 4. 
Safe softmax: [0.67, 0.25, 0.09]\n", + "```\n", + "\n", + "## Mathematical Foundation\n", + "\n", + "For predictions and class indices:\n", + "```\n", + "CrossEntropy = -Ξ£ y_true Γ— log(softmax(y_pred))\n", + "\n", + "Softmax: softmax(x_i) = exp(x_i) / Ξ£ exp(x_j)\n", + "Stable: softmax(x_i) = exp(x_i - max(x)) / Ξ£ exp(x_j - max(x))\n", + "```\n", + "\n", + "## Learning Objectives\n", + "By implementing Cross-Entropy, you'll understand:\n", + "- How classification losses work with probability distributions and information theory\n", + "- Why softmax normalization creates proper probability distributions for multi-class problems\n", + "- The critical importance of numerical stability in exponential and logarithmic computations\n", + "- How cross-entropy naturally encourages confident, correct predictions through its gradient structure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22a7ac21", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "crossentropy-concept-question", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Computational Question: CrossEntropy Stability**\n", + "\n", + "Consider numerical stability in cross-entropy:\n", + "\n", + "1. What happens if you compute exp(100) directly?\n", + "2. Why does subtracting the maximum value prevent overflow?\n", + "3. What happens if log(0) occurs during loss computation?\n", + "4. 
How does epsilon clipping prevent this issue?\n", + "\n", + "Understanding these edge cases is crucial for reliable implementation.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b638a54b", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "crossentropy-loss-implementation", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class CrossEntropyLoss:\n", + " \"\"\"\n", + " Cross-Entropy Loss for Multi-Class Classification Problems\n", + " \n", + " Computes the cross-entropy between predicted probability distributions\n", + " and true class labels with numerically stable implementation.\n", + " \n", + " Features:\n", + " - Numerically stable softmax computation using log-sum-exp trick\n", + " - Support for both class indices and one-hot encoding\n", + " - Efficient batch processing with proper broadcasting\n", + " - Automatic handling of edge cases and extreme values\n", + " \n", + " Example Usage:\n", + " ce_loss = CrossEntropyLoss()\n", + " loss = ce_loss(logits, class_indices) # Returns scalar loss value\n", + " \"\"\"\n", + " \n", + " def __init__(self):\n", + " \"\"\"Initialize CrossEntropy loss function.\"\"\"\n", + " pass\n", + " \n", + " def __call__(self, y_pred, y_true):\n", + " \"\"\"\n", + " Compute CrossEntropy loss between predictions and targets.\n", + " \n", + " Args:\n", + " y_pred: Model predictions/logits (Tensor, shape: [batch_size, num_classes])\n", + " y_true: True class indices (Tensor, shape: [batch_size]) or one-hot encoding\n", + " \n", + " Returns:\n", + " Tensor with scalar loss value\n", + " \n", + " TODO: Implement CrossEntropy with numerically stable softmax computation.\n", + " \n", + " APPROACH:\n", + " 1. Convert inputs to tensors and handle single samples\n", + " 2. Apply log-sum-exp trick for numerically stable softmax\n", + " 3. 
Clip probabilities to prevent log(0) issues\n", + " 4. Compute cross-entropy based on target format (indices vs one-hot)\n", + " \n", + " EXAMPLE:\n", + " >>> ce = CrossEntropyLoss()\n", + " >>> logits = Tensor([[2.0, 1.0, 0.0]]) # Raw model outputs\n", + " >>> targets = Tensor([0]) # Class 0 is correct\n", + " >>> loss = ce(logits, targets)\n", + " >>> print(loss.data)\n", + " 0.407 # -log(softmax([2.0, 1.0, 0.0])[0])\n", + " \n", + " HINTS:\n", + " - Use np.max(axis=1, keepdims=True) for stable max computation\n", + " - Use np.clip(probabilities, 1e-15, 1.0-1e-15) to prevent log(0)\n", + " - Handle both index format [0,1,2] and one-hot format [[1,0,0], [0,1,0]]\n", + " - Use advanced indexing: probs[np.arange(batch_size), class_indices]\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " # Step 1: Ensure we have tensor inputs for consistent processing\n", + " if not isinstance(y_pred, Tensor):\n", + " y_pred = Tensor(y_pred) # Convert predictions to tensor format\n", + " if not isinstance(y_true, Tensor):\n", + " y_true = Tensor(y_true) # Convert targets to tensor format\n", + " \n", + " # Step 1: Extract numpy arrays for computation\n", + " prediction_logits = y_pred.data # Raw model outputs (pre-softmax)\n", + " target_labels = y_true.data # True class indices or one-hot vectors\n", + " \n", + " # Step 2: Handle both single predictions and batches consistently\n", + " if prediction_logits.ndim == 1:\n", + " prediction_logits = prediction_logits.reshape(1, -1) # Convert to batch format [1, num_classes]\n", + " \n", + " # Step 3: Apply numerically stable softmax transformation\n", + " # Subtract max to prevent overflow: exp(x-max) is equivalent but stable\n", + " max_logits = np.max(prediction_logits, axis=1, keepdims=True)\n", + " exp_pred = np.exp(prediction_logits - max_logits)\n", + " softmax_pred = exp_pred / np.sum(exp_pred, axis=1, keepdims=True)\n", + " \n", + " # Step 4: Prevent numerical instability in log computation\n", + " epsilon = 1e-15 # Small value 
to prevent log(0) β†’ -inf and log(1) β†’ 0 issues\n", + " softmax_pred = np.clip(softmax_pred, epsilon, 1.0 - epsilon)\n", + " \n", + " # Step 5: Compute cross-entropy loss based on target format\n", + " if len(target_labels.shape) == 1:\n", + " # Format A: y_true contains class indices [0, 1, 2, ...]\n", + " batch_size = target_labels.shape[0]\n", + " # Extract probabilities for correct classes using advanced indexing\n", + " correct_class_probs = softmax_pred[np.arange(batch_size), target_labels.astype(int)]\n", + " log_probs = np.log(correct_class_probs)\n", + " loss_value = -np.mean(log_probs) # Negative log-likelihood\n", + " else:\n", + " # Format B: y_true is one-hot encoded [[1,0,0], [0,1,0], ...]\n", + " log_probs = np.log(softmax_pred)\n", + " # Multiply one-hot targets with log probabilities, sum across classes\n", + " weighted_log_probs = target_labels * log_probs\n", + " loss_value = -np.mean(np.sum(weighted_log_probs, axis=1))\n", + " \n", + " return Tensor(loss_value)\n", + " ### END SOLUTION\n", + " \n", + " def forward(self, y_pred, y_true):\n", + " \"\"\"Alternative interface for forward pass.\"\"\"\n", + " return self.__call__(y_pred, y_true)\n", + "\n", + "# πŸ” SYSTEMS INSIGHT: CrossEntropy Stability Analysis\n", + "def analyze_crossentropy_stability():\n", + " \"\"\"Analyze numerical stability in cross-entropy computation.\"\"\"\n", + " print(\"πŸ” CrossEntropy Stability Analysis\")\n", + " print(\"=\" * 40)\n", + " \n", + " try:\n", + " ce = CrossEntropyLoss()\n", + " \n", + " # Test numerical stability with extreme values\n", + " print(\"\\n⚑ Numerical Stability Testing:\")\n", + " \n", + " # Extreme logits that would overflow in naive implementation\n", + " extreme_logits = Tensor([[100.0, 99.0, 98.0]])\n", + " safe_labels = Tensor([0])\n", + " \n", + " loss = ce(extreme_logits, safe_labels)\n", + " print(f\" Extreme logits [100, 99, 98]: Loss = {loss.data:.6f}\")\n", + " print(f\" No overflow or NaN: {not np.isnan(loss.data) and not 
np.isinf(loss.data)}\")\n", + " \n", + " # Test epsilon clipping effectiveness\n", + " print(f\"\\nπŸ›‘οΈ Epsilon Clipping Protection:\")\n", + " very_confident = Tensor([[10.0, -10.0, -10.0]]) # Very confident about class 0\n", + " confident_labels = Tensor([0])\n", + " \n", + " loss = ce(very_confident, confident_labels)\n", + " print(f\" Very confident correct prediction: Loss = {loss.data:.6f}\")\n", + " print(f\" Should be near 0: {loss.data < 0.01}\")\n", + " \n", + " # Compare different confidence levels\n", + " print(f\"\\nπŸ“Š Confidence vs Loss Relationship:\")\n", + " confidence_levels = [\n", + " (\"Low confidence\", [[0.1, 0.0, -0.1]]),\n", + " (\"Medium confidence\", [[1.0, 0.0, -1.0]]),\n", + " (\"High confidence\", [[5.0, 0.0, -5.0]]),\n", + " (\"Very high\", [[10.0, 0.0, -10.0]])\n", + " ]\n", + " \n", + " for name, logits in confidence_levels:\n", + " test_logits = Tensor(logits)\n", + " test_loss = ce(test_logits, Tensor([0]))\n", + " print(f\" {name:15}: Loss = {test_loss.data:.6f}\")\n", + " \n", + " # Memory efficiency for large vocabularies\n", + " print(f\"\\nπŸ’Ύ Memory Scaling Analysis:\")\n", + " small_vocab = Tensor(np.random.randn(32, 100)) # 100 classes\n", + " large_vocab = Tensor(np.random.randn(32, 10000)) # 10k classes\n", + " \n", + " import sys\n", + " small_memory = sys.getsizeof(small_vocab.data)\n", + " large_memory = sys.getsizeof(large_vocab.data)\n", + " \n", + " print(f\" Small vocab (100 classes): {small_memory / 1024:.1f} KB\")\n", + " print(f\" Large vocab (10k classes): {large_memory / 1024:.1f} KB\")\n", + " print(f\" Memory scales O(batch_size Γ— num_classes)\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: CrossEntropy memory scales with vocabulary size.\n", + " # This is why large language models use techniques like hierarchical softmax\n", + " # or sampling-based training to handle vocabularies with 50k+ tokens.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Analysis error: {e}\")\n", + " print(\"Ensure 
CrossEntropy implementation is complete\")" + ] + }, + { + "cell_type": "markdown", + "id": "31b5abca", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### πŸ§ͺ Unit Test: Cross-Entropy Loss Computation\n", + "This test validates `CrossEntropyLoss.__call__`, ensuring correct cross-entropy computation with numerically stable softmax." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6062489", + "metadata": { + "nbgrader": { + "grade": true, + "grade_id": "test-crossentropy-loss", + "locked": true, + "points": 4, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_unit_crossentropy_loss():\n", + " \"\"\"Test CrossEntropy loss implementation.\"\"\"\n", + " print(\"πŸ§ͺ Testing Cross-Entropy Loss...\")\n", + " \n", + " ce = CrossEntropyLoss()\n", + " \n", + " # Test case 1: Perfect predictions\n", + " y_pred = Tensor([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0]]) # Very confident correct predictions\n", + " y_true = Tensor([0, 1]) # Class indices\n", + " loss = ce(y_pred, y_true)\n", + " assert loss.data < 0.1, f\"Perfect predictions should have low loss, got {loss.data}\"\n", + " print(\"βœ… Perfect predictions test passed\")\n", + " \n", + " # Test case 2: Random predictions (should have higher loss)\n", + " y_pred = Tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) # Uniform after softmax\n", + " y_true = Tensor([0, 1])\n", + " loss = ce(y_pred, y_true)\n", + " expected_random = -np.log(1.0/3.0) # log(1/num_classes) for uniform distribution\n", + " assert abs(loss.data - expected_random) < 0.1, f\"Random predictions should have loss β‰ˆ {expected_random}, got {loss.data}\"\n", + " print(\"βœ… Random predictions test passed\")\n", + " \n", + " # Test case 3: Binary classification\n", + " y_pred = Tensor([[2.0, 1.0], [1.0, 2.0]])\n", + " y_true = Tensor([0, 1])\n", + " loss = ce(y_pred, y_true)\n", + " assert 0.0 < loss.data < 2.0, f\"Binary classification 
loss should be reasonable, got {loss.data}\"\n", + " print(\"βœ… Binary classification test passed\")\n", + " \n", + " # Test case 4: One-hot encoded labels\n", + " y_pred = Tensor([[2.0, 1.0, 0.0], [0.0, 2.0, 1.0]])\n", + " y_true = Tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) # One-hot encoded\n", + " loss = ce(y_pred, y_true)\n", + " assert 0.0 < loss.data < 2.0, f\"One-hot encoded loss should be reasonable, got {loss.data}\"\n", + " print(\"βœ… One-hot encoded labels test passed\")\n", + " \n", + " print(\"πŸŽ‰ Cross-Entropy loss tests passed! Understanding classification objectives.\")\n", + "\n", + "test_unit_crossentropy_loss()" + ] + }, + { + "cell_type": "markdown", + "id": "13e8a85c", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Binary Cross-Entropy Loss - Optimized for Binary Classification\n", + "\n", + "Binary Cross-Entropy Loss is the specialized, efficient version of cross-entropy for binary (two-class) problems. It's more stable and faster than using regular cross-entropy with 2 classes.\n", + "\n", + "## Visual Understanding: Binary Cross-Entropy\n", + "\n", + "```\n", + "Binary Classification Landscape:\n", + "\n", + "Sigmoid Activation:\n", + " Raw Logit β†’ Sigmoid β†’ Probability β†’ Loss\n", + " -5.0 β†’ 0.007 β†’ 0.007 β†’ High loss (if true=1)\n", + " 0.0 β†’ 0.500 β†’ 0.500 β†’ Medium loss\n", + " +5.0 β†’ 0.993 β†’ 0.993 β†’ Low loss (if true=1)\n", + "\n", + "Loss Behavior:\n", + " BCE = -[yΓ—log(p) + (1-y)Γ—log(1-p)]\n", + " \n", + " For y=1 (positive class):\n", + " p=0.9 β†’ -log(0.9) = 0.105 (low loss)\n", + " p=0.1 β†’ -log(0.1) = 2.303 (high loss)\n", + " \n", + " For y=0 (negative class):\n", + " p=0.1 β†’ -log(0.9) = 0.105 (low loss) \n", + " p=0.9 β†’ -log(0.1) = 2.303 (high loss)\n", + "```\n", + "\n", + "## Numerical Stability Solution\n", + "\n", + "```\n", + "The Binary Cross-Entropy Stability Problem:\n", + " \n", + " BCE = -[yΓ—log(Οƒ(x)) + (1-y)Γ—log(1-Οƒ(x))]\n", + " \n", + " Where Οƒ(x) = 
1/(1+exp(-x))\n", + " \n", + " Problems:\n", + " - Large positive x: exp(-x) β†’ 0, then log(1) β†’ 0 (loss of precision)\n", + " - Large negative x: Οƒ(x) β†’ 0, then log(0) β†’ -∞\n", + " \n", + "Our Stable Solution:\n", + " BCE = max(x,0) - xΓ—y + log(1 + exp(-|x|))\n", + " \n", + " Why this works:\n", + " - max(x,0) handles positive values\n", + " - -xΓ—y is the \"cross\" term \n", + " - log(1+exp(-|x|)) is always stable (exp≀1)\n", + "```\n", + "\n", + "## Mathematical Foundation\n", + "\n", + "For binary predictions and labels:\n", + "```\n", + "BCE = -y Γ— log(Οƒ(x)) - (1-y) Γ— log(1-Οƒ(x))\n", + "\n", + "Stable form: BCE = max(x,0) - xΓ—y + log(1 + exp(-|x|))\n", + "```\n", + "\n", + "## Learning Objectives\n", + "By implementing Binary Cross-Entropy, you'll understand:\n", + "- How binary classification creates simpler optimization landscapes than multi-class problems\n", + "- Why sigmoid activation naturally pairs with binary cross-entropy loss through its gradient structure\n", + "- The critical importance of numerically stable formulations for reliable production training\n", + "- How specialized binary losses achieve better efficiency and stability than general solutions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b7f8af9", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "binary-crossentropy-concept", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Computational Question: Binary Stability**\n", + "\n", + "Consider the stable BCE formulation:\n", + "\n", + "1. Why does max(x,0) - xΓ—y + log(1+exp(-|x|)) work?\n", + "2. What happens when x=100? (trace through the computation)\n", + "3. What happens when x=-100? (trace through the computation)\n", + "4. 
How does this prevent both overflow and underflow?\n", + "\n", + "This mathematical insight is crucial for production systems.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c53864df", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "binary-crossentropy-implementation", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class BinaryCrossEntropyLoss:\n", + " \"\"\"\n", + " Binary Cross-Entropy Loss for Binary Classification Problems\n", + " \n", + " Computes binary cross-entropy between predictions and binary labels\n", + " with numerically stable sigmoid + BCE implementation.\n", + " \n", + " Features:\n", + " - Numerically stable computation from logits using stable BCE formula\n", + " - Efficient batch processing with vectorized operations\n", + " - Automatic sigmoid application through stable formulation\n", + " - Robust to extreme input values without overflow/underflow\n", + " \n", + " Example Usage:\n", + " bce_loss = BinaryCrossEntropyLoss()\n", + " loss = bce_loss(logits, binary_labels) # Returns scalar loss value\n", + " \"\"\"\n", + " \n", + " def __init__(self):\n", + " \"\"\"Initialize Binary CrossEntropy loss function.\"\"\"\n", + " pass\n", + " \n", + " def __call__(self, y_pred, y_true):\n", + " \"\"\"\n", + " Compute Binary CrossEntropy loss between predictions and targets.\n", + " \n", + " Args:\n", + " y_pred: Model predictions/logits (Tensor, shape: [batch_size, 1] or [batch_size])\n", + " y_true: True binary labels (Tensor, shape: [batch_size, 1] or [batch_size])\n", + " \n", + " Returns:\n", + " Tensor with scalar loss value\n", + " \n", + " TODO: Implement stable binary cross-entropy using the logits formulation.\n", + " \n", + " APPROACH:\n", + " 1. Convert inputs to tensors and flatten for consistent processing\n", + " 2. 
Use stable BCE formula: max(x,0) - xΓ—y + log(1+exp(-|x|))\n", + " 3. Apply this formula element-wise across the batch\n", + " 4. Return mean loss across all samples\n", + " \n", + " EXAMPLE:\n", + " >>> bce = BinaryCrossEntropyLoss()\n", + " >>> logits = Tensor([[2.0], [-1.0]]) # Raw outputs\n", + " >>> labels = Tensor([[1.0], [0.0]]) # Binary targets\n", + " >>> loss = bce(logits, labels)\n", + " >>> print(loss.data)\n", + " 0.220 # Stable computation of binary cross-entropy\n", + " \n", + " HINTS:\n", + " - Use np.maximum(logits, 0) for the max(x,0) term\n", + " - Use np.abs(logits) to ensure exp argument is ≀ 0\n", + " - The formula naturally handles both positive and negative logits\n", + " - Return np.mean() for batch averaging\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " # Step 1: Ensure we have tensor inputs for consistent processing\n", + " if not isinstance(y_pred, Tensor):\n", + " y_pred = Tensor(y_pred) # Convert predictions to tensor format\n", + " if not isinstance(y_true, Tensor):\n", + " y_true = Tensor(y_true) # Convert targets to tensor format\n", + " \n", + " # Get flat arrays for computation\n", + " logits = y_pred.data.flatten()\n", + " labels = y_true.data.flatten()\n", + " \n", + " # Step 1: Define numerically stable binary cross-entropy computation\n", + " def stable_bce_with_logits(logits, labels):\n", + " \"\"\"\n", + " Numerically stable BCE using the logits formulation:\n", + " BCE(logits, y) = max(logits, 0) - logits * y + log(1 + exp(-|logits|))\n", + " \n", + " This formulation prevents:\n", + " - exp(large_positive_logit) β†’ overflow\n", + " - log(very_small_sigmoid) β†’ -inf\n", + " \n", + " Mathematical equivalence:\n", + " - For positive logits: x - x*y + log(1 + exp(-x))\n", + " - For negative logits: -x*y + log(1 + exp(x))\n", + " \"\"\"\n", + " # Step 1a: Handle positive logits to prevent exp(large_positive) overflow\n", + " positive_part = np.maximum(logits, 0)\n", + " \n", + " # Step 1b: Subtract logit-label product 
(the \"cross\" in cross-entropy)\n", + " cross_term = logits * labels\n", + " \n", + " # Step 1c: Add log(1 + exp(-|logits|)) for numerical stability\n", + " # Using abs(logits) ensures the exponent is always negative or zero\n", + " stability_term = np.log(1 + np.exp(-np.abs(logits)))\n", + " \n", + " return positive_part - cross_term + stability_term\n", + " \n", + " # Step 2: Apply stable BCE computation across the batch\n", + " individual_losses = stable_bce_with_logits(logits, labels)\n", + " mean_loss = np.mean(individual_losses) # Average loss across batch\n", + " \n", + " return Tensor(mean_loss)\n", + " ### END SOLUTION\n", + " \n", + " def forward(self, y_pred, y_true):\n", + " \"\"\"Alternative interface for forward pass.\"\"\"\n", + " return self.__call__(y_pred, y_true)\n", + "\n", + "# πŸ” SYSTEMS INSIGHT: Binary CrossEntropy Efficiency Analysis\n", + "def analyze_binary_crossentropy_efficiency():\n", + " \"\"\"Analyze binary cross-entropy computational efficiency.\"\"\"\n", + " print(\"πŸ” Binary CrossEntropy Efficiency Analysis\")\n", + " print(\"=\" * 45)\n", + " \n", + " try:\n", + " bce = BinaryCrossEntropyLoss()\n", + " ce = CrossEntropyLoss() # For comparison\n", + " \n", + " # Compare binary-specific vs general cross-entropy\n", + " print(\"\\n⚑ Binary vs Multi-Class Efficiency:\")\n", + " \n", + " # Binary problem solved two ways\n", + " binary_logits = Tensor([[1.5], [-0.8], [2.1]])\n", + " binary_labels = Tensor([[1.0], [0.0], [1.0]])\n", + " \n", + " # Method 1: Binary CrossEntropy\n", + " binary_loss = bce(binary_logits, binary_labels)\n", + " \n", + " # Method 2: 2-class CrossEntropy (equivalent but less efficient)\n", + " multiclass_logits = Tensor([[1.5, 0.0], [-0.8, 0.0], [2.1, 0.0]])\n", + " multiclass_labels = Tensor([0, 1, 0]) # Convert to class indices\n", + " multiclass_loss = ce(multiclass_logits, multiclass_labels)\n", + " \n", + " print(f\" Binary CE Loss: {binary_loss.data:.6f}\")\n", + " print(f\" 2-Class CE Loss: 
{multiclass_loss.data:.6f}\")\n", + " print(f\" Difference: {abs(binary_loss.data - multiclass_loss.data):.8f}\")\n", + " \n", + " # Memory efficiency comparison\n", + " print(f\"\\nπŸ’Ύ Memory Efficiency Comparison:\")\n", + " \n", + " batch_size = 1000\n", + " binary_memory = batch_size * 1 * 8 # 1 value per sample, 8 bytes per float64\n", + " multiclass_memory = batch_size * 2 * 8 # 2 classes, 8 bytes per float64\n", + " \n", + " print(f\" Binary approach: {binary_memory / 1024:.1f} KB\")\n", + " print(f\" Multi-class (2): {multiclass_memory / 1024:.1f} KB\")\n", + " print(f\" Binary is {multiclass_memory/binary_memory:.1f}Γ— more memory efficient\")\n", + " \n", + " # Stability test with extreme values\n", + " print(f\"\\nπŸ›‘οΈ Extreme Value Stability:\")\n", + " extreme_tests = [\n", + " (\"Large positive\", [[100.0]], [[1.0]]),\n", + " (\"Large negative\", [[-100.0]], [[0.0]]),\n", + " (\"Mixed extreme\", [[100.0], [-100.0]], [[1.0], [0.0]])\n", + " ]\n", + " \n", + " for name, logits, labels in extreme_tests:\n", + " test_logits = Tensor(logits)\n", + " test_labels = Tensor(labels)\n", + " loss = bce(test_logits, test_labels)\n", + " is_stable = not (np.isnan(loss.data) or np.isinf(loss.data))\n", + " print(f\" {name:15}: Loss = {loss.data:.6f}, Stable = {is_stable}\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: Binary CrossEntropy is 2Γ— more memory efficient\n", + " # than regular CrossEntropy for binary problems, and provides better\n", + " # numerical stability through its specialized formulation.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Analysis error: {e}\")\n", + " print(\"Ensure BinaryCrossEntropy implementation is complete\")" + ] + }, + { + "cell_type": "markdown", + "id": "cd8abd01", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### πŸ§ͺ Unit Test: Binary Cross-Entropy Loss\n", + "This test validates `BinaryCrossEntropyLoss.__call__`, ensuring stable binary cross-entropy computation 
with extreme values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "400a7568", + "metadata": { + "nbgrader": { + "grade": true, + "grade_id": "test-binary-crossentropy", + "locked": true, + "points": 4, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_unit_binary_crossentropy_loss():\n", + " \"\"\"Test Binary CrossEntropy loss implementation.\"\"\"\n", + " print(\"πŸ§ͺ Testing Binary Cross-Entropy Loss...\")\n", + " \n", + " bce = BinaryCrossEntropyLoss()\n", + " \n", + " # Test case 1: Perfect predictions\n", + " y_pred = Tensor([[10.0], [-10.0]]) # Very confident correct predictions\n", + " y_true = Tensor([[1.0], [0.0]])\n", + " loss = bce(y_pred, y_true)\n", + " assert loss.data < 0.1, f\"Perfect predictions should have low loss, got {loss.data}\"\n", + " print(\"βœ… Perfect predictions test passed\")\n", + " \n", + " # Test case 2: Random predictions (should have higher loss)\n", + " y_pred = Tensor([[0.0], [0.0]]) # 0.5 probability after sigmoid\n", + " y_true = Tensor([[1.0], [0.0]])\n", + " loss = bce(y_pred, y_true)\n", + " expected_random = -np.log(0.5) # log(0.5) for random guessing\n", + " assert abs(loss.data - expected_random) < 0.1, f\"Random predictions should have loss β‰ˆ {expected_random}, got {loss.data}\"\n", + " print(\"βœ… Random predictions test passed\")\n", + " \n", + " # Test case 3: Batch processing\n", + " y_pred = Tensor([[1.0], [2.0], [-1.0]])\n", + " y_true = Tensor([[1.0], [1.0], [0.0]])\n", + " loss = bce(y_pred, y_true)\n", + " assert 0.0 < loss.data < 2.0, f\"Batch processing loss should be reasonable, got {loss.data}\"\n", + " print(\"βœ… Batch processing test passed\")\n", + " \n", + " # Test case 4: Extreme values (test numerical stability)\n", + " y_pred = Tensor([[100.0], [-100.0]]) # Extreme logits\n", + " y_true = Tensor([[1.0], [0.0]])\n", + " loss = bce(y_pred, y_true)\n", + " assert not np.isnan(loss.data) and not 
np.isinf(loss.data), f\"Extreme values should not cause NaN/Inf, got {loss.data}\"\n", + " assert loss.data < 1.0, f\"Extreme correct predictions should have low loss, got {loss.data}\"\n", + " print(\"βœ… Extreme values test passed\")\n", + " \n", + " print(\"πŸŽ‰ Binary Cross-Entropy loss tests passed! Understanding binary objectives.\")\n", + "\n", + "test_unit_binary_crossentropy_loss()" + ] + }, + { + "cell_type": "markdown", + "id": "13b3bd16", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Custom Loss Functions - Aligning with Business Objectives\n", + "\n", + "Beyond standard loss functions, production ML systems often need custom losses that align with specific business objectives and domain constraints.\n", + "\n", + "## Business-Aligned Loss Design Patterns\n", + "\n", + "### Asymmetric Loss Functions\n", + "When false positives and false negatives have different costs:\n", + "\n", + "```python\n", + "# Medical diagnosis: False negatives (missing disease) cost 10Γ— more\n", + "class AsymmetricBinaryCrossEntropy(BinaryCrossEntropyLoss):\n", + " def __init__(self, false_negative_weight=10.0):\n", + " super().__init__()\n", + " self.fn_weight = false_negative_weight\n", + "\n", + " def __call__(self, y_pred, y_true):\n", + " # Standard BCE\n", + " base_loss = super().__call__(y_pred, y_true)\n", + "\n", + " # Weight false negatives more heavily\n", + " # When y_true=1 and y_pred is low, increase penalty\n", + " sigmoid_pred = 1 / (1 + np.exp(-y_pred.data))\n", + " fn_penalty = y_true.data * (1 - sigmoid_pred) * self.fn_weight\n", + "\n", + " weighted_loss = base_loss.data + np.mean(fn_penalty)\n", + " return Tensor(weighted_loss)\n", + "```\n", + "\n", + "### Focal Loss for Imbalanced Data\n", + "Addresses class imbalance by focusing on hard examples:\n", + "\n", + "```python\n", + "class FocalLoss(CrossEntropyLoss):\n", + " def __init__(self, alpha=1.0, gamma=2.0):\n", + " super().__init__()\n", + " self.alpha = alpha # Class balance 
weight\n", + " self.gamma = gamma # Focusing parameter\n", + "\n", + " def __call__(self, y_pred, y_true):\n", + " # Get standard cross-entropy\n", + " ce_loss = super().__call__(y_pred, y_true)\n", + "\n", + " # Calculate softmax probabilities\n", + " max_logits = np.max(y_pred.data, axis=1, keepdims=True)\n", + " stable_logits = y_pred.data - max_logits\n", + " exp_logits = np.exp(stable_logits)\n", + " softmax_probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)\n", + "\n", + " # Get probability of correct class\n", + " batch_size = y_true.data.shape[0]\n", + " correct_probs = softmax_probs[np.arange(batch_size), y_true.data.astype(int)]\n", + "\n", + " # Apply focal loss formula: -Ξ±(1-p)^Ξ³ log(p)\n", + " focal_weight = self.alpha * ((1 - correct_probs) ** self.gamma)\n", + " focal_loss = focal_weight * ce_loss.data\n", + "\n", + " return Tensor(np.mean(focal_loss))\n", + "```\n", + "\n", + "### Ranking-Aware Loss\n", + "For problems where order matters (search, recommendations):\n", + "\n", + "```python\n", + "class RankingAwareLoss:\n", + " def __init__(self, position_weights=None):\n", + " # Higher weights for top positions\n", + " self.position_weights = position_weights or [10.0, 5.0, 2.0, 1.0, 0.5]\n", + "\n", + " def __call__(self, predictions, targets, positions):\n", + " \"\"\"predictions: relevance scores, targets: true relevance, positions: result positions\"\"\"\n", + " mse = MeanSquaredError()\n", + "\n", + " # Weight errors by position importance\n", + " weighted_errors = []\n", + " for pred, target, pos in zip(predictions.data, targets.data, positions.data):\n", + " pos_weight = self.position_weights[min(int(pos), len(self.position_weights)-1)]\n", + " error = ((pred - target) ** 2) * pos_weight\n", + " weighted_errors.append(error)\n", + "\n", + " return Tensor(np.mean(weighted_errors))\n", + "```\n", + "\n", + "## Advanced Custom Loss Patterns\n", + "\n", + "### Multi-Task Learning Loss\n", + "Combining multiple objectives with 
learned weights:\n", + "\n", + "```python\n", + "class MultiTaskLoss:\n", + " def __init__(self, num_tasks=3):\n", + " # Learnable loss weights (log-variance parameterization for stability)\n", + " self.log_vars = [0.0] * num_tasks\n", + "\n", + " def __call__(self, predictions_list, targets_list):\n", + " \"\"\"predictions_list: [task1_preds, task2_preds, ...]\"\"\"\n", + " total_loss = 0\n", + "\n", + " for i, (preds, targets) in enumerate(zip(predictions_list, targets_list)):\n", + " # Choose appropriate loss for each task\n", + " if i == 0: # Regression task\n", + " task_loss = MeanSquaredError()(preds, targets)\n", + " else: # Classification tasks\n", + " task_loss = CrossEntropyLoss()(preds, targets)\n", + "\n", + " # Uncertainty-weighted combination\n", + " precision = np.exp(-self.log_vars[i])\n", + " weighted_loss = precision * task_loss.data + self.log_vars[i]\n", + " total_loss += weighted_loss\n", + "\n", + " return Tensor(total_loss)\n", + "```\n", + "\n", + "### Contrastive Loss for Similarity Learning\n", + "For learning embeddings and similarity:\n", + "\n", + "```python\n", + "class ContrastiveLoss:\n", + " def __init__(self, margin=1.0):\n", + " self.margin = margin\n", + "\n", + " def __call__(self, embeddings1, embeddings2, labels):\n", + " \"\"\"labels: 1 for similar pairs, 0 for dissimilar\"\"\"\n", + " # Euclidean distance between embeddings\n", + " distances = np.sqrt(np.sum((embeddings1.data - embeddings2.data) ** 2, axis=1))\n", + "\n", + " # Contrastive loss formula\n", + " positive_loss = labels.data * (distances ** 2)\n", + " negative_loss = (1 - labels.data) * np.maximum(0, self.margin - distances) ** 2\n", + "\n", + " total_loss = 0.5 * (positive_loss + negative_loss)\n", + " return Tensor(np.mean(total_loss))\n", + "```\n", + "\n", + "## Custom Loss Implementation Guidelines\n", + "\n", + "### Numerical Stability Considerations\n", + "```python\n", + "# Always include stability measures in custom losses\n", + "class 
StableCustomLoss:\n", + " def __call__(self, predictions, targets):\n", + " # 1. Input validation\n", + " if not isinstance(predictions, Tensor):\n", + " predictions = Tensor(predictions)\n", + "\n", + " # 2. Handle edge cases\n", + " predictions_clipped = np.clip(predictions.data, -100, 100) # Prevent overflow\n", + "\n", + " # 3. Use numerically stable formulations\n", + " # Avoid: exp(large_number), log(small_number)\n", + " # Use: log-sum-exp trick, epsilon clipping\n", + "\n", + " # 4. Return tensor for consistency\n", + " return Tensor(computed_loss)\n", + "```\n", + "\n", + "### Gradient-Friendly Design\n", + "```python\n", + "# Ensure gradients flow properly\n", + "class GradientFriendlyLoss:\n", + " def __call__(self, predictions, targets):\n", + " # Avoid operations that create zero gradients:\n", + " # - Hard thresholding: use soft approximations\n", + " # - Discrete operations: use continuous relaxations\n", + " # - Large plateaus: ensure non-zero gradients everywhere\n", + "\n", + " # Good: Smooth, differentiable operations\n", + " smooth_loss = self.smooth_l1_loss(predictions, targets)\n", + " return smooth_loss\n", + "\n", + " def smooth_l1_loss(self, pred, target, beta=1.0):\n", + " \"\"\"Smooth L1 loss - less sensitive to outliers than MSE\"\"\"\n", + " diff = np.abs(pred.data - target.data)\n", + " loss = np.where(diff < beta,\n", + " 0.5 * diff * diff / beta,\n", + " diff - 0.5 * beta)\n", + " return Tensor(np.mean(loss))\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "e84c5945", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Loss Function Application Guide and Comparison\n", + "\n", + "## When to Use Each Loss Function\n", + "\n", + "Understanding which loss function to use is critical for successful ML projects:\n", + "\n", + "### Mean Squared Error (MSE) - Regression Problems\n", + "```\n", + "Use when: Predicting continuous values\n", + "Examples: House prices, temperature, stock values, ages\n", + "Output: Any 
real number\n", + "Activation: Usually none (linear output)\n", + "Penalty: Quadratic (large errors >> small errors)\n", + "\n", + "Model Architecture:\n", + "Input β†’ Hidden Layers β†’ Linear Output β†’ MSE Loss\n", + "```\n", + "\n", + "### Cross-Entropy Loss - Multi-Class Classification \n", + "```\n", + "Use when: Choosing one class from 3+ options\n", + "Examples: Image classification, text categorization, medical diagnosis\n", + "Output: Probability distribution (sums to 1)\n", + "Activation: Softmax\n", + "Penalty: Logarithmic (encouraging confident correct predictions)\n", + "\n", + "Model Architecture:\n", + "Input β†’ Hidden Layers β†’ Softmax β†’ CrossEntropy Loss\n", + "```\n", + "\n", + "### Binary Cross-Entropy Loss - Binary Classification\n", + "```\n", + "Use when: Binary decisions (yes/no, positive/negative)\n", + "Examples: Spam detection, fraud detection, medical screening\n", + "Output: Single probability (0 to 1)\n", + "Activation: Sigmoid\n", + "Penalty: Asymmetric (confident wrong predictions heavily penalized)\n", + "\n", + "Model Architecture:\n", + "Input β†’ Hidden Layers β†’ Sigmoid β†’ Binary CrossEntropy Loss\n", + "```\n", + "\n", + "## Performance and Stability Comparison\n", + "\n", + "```\n", + "Computational Characteristics:\n", + " MSE CrossEntropy Binary CE\n", + "Time Complexity: O(n) O(nΓ—c) O(n)\n", + "Memory Complexity: O(1) O(nΓ—c) O(n)\n", + "Numerical Stability: High Medium High\n", + "Convergence Speed: Fast Medium Fast\n", + "\n", + "Where: n = batch size, c = number of classes\n", + "```\n", + "\n", + "## Integration with Neural Networks\n", + "\n", + "```python\n", + "# Example training setup for different problem types:\n", + "\n", + "# Regression Problem (House Price Prediction)\n", + "regression_model = Sequential([\n", + " Linear(10, 64), # Input features β†’ Hidden\n", + " ReLU(),\n", + " Linear(64, 1), # Hidden β†’ Single output\n", + " # No activation - linear output for regression\n", + "])\n", + "loss_fn = 
MeanSquaredError()\n", + "\n", + "# Multi-Class Classification (Image Recognition)\n", + "classification_model = Sequential([\n", + " Linear(784, 128), # Flattened image β†’ Hidden\n", + " ReLU(),\n", + " Linear(128, 10), # Hidden β†’ 10 classes\n", + " Softmax() # Convert to probabilities\n", + "])\n", + "loss_fn = CrossEntropyLoss()\n", + "\n", + "# Binary Classification (Spam Detection)\n", + "binary_model = Sequential([\n", + " Linear(100, 64), # Text features β†’ Hidden\n", + " ReLU(),\n", + " Linear(64, 1), # Hidden β†’ Single output\n", + " Sigmoid() # Convert to probability\n", + "])\n", + "loss_fn = BinaryCrossEntropyLoss()\n", + "\n", + "# Training loop pattern (same for all):\n", + "for batch in dataloader:\n", + " predictions = model(batch.inputs)\n", + " loss = loss_fn(predictions, batch.targets)\n", + " # loss.backward() # Compute gradients (when autograd is available)\n", + " # optimizer.step() # Update parameters\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "91ce7d95", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### πŸ§ͺ Comprehensive Integration Test\n", + "This test validates all loss functions work together correctly and can be used interchangeably in production systems." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9df44d7b", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "comprehensive-loss-tests", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_unit_comprehensive_loss_integration():\n", + " \"\"\"Test all loss functions work correctly together.\"\"\"\n", + " print(\"πŸ”¬ Comprehensive Loss Function Integration Testing\")\n", + " print(\"=\" * 55)\n", + " \n", + " # Test 1: All losses can be instantiated\n", + " print(\"\\n1. 
Loss Function Instantiation:\")\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + " print(\" βœ… All loss functions created successfully\")\n", + " \n", + " # Test 2: Loss functions return appropriate types\n", + " print(\"\\n2. Return Type Verification:\")\n", + " \n", + " # MSE test\n", + " pred = Tensor([[1.0, 2.0]])\n", + " target = Tensor([[1.0, 2.0]])\n", + " loss = mse(pred, target)\n", + " assert isinstance(loss, Tensor), \"MSE should return Tensor\"\n", + " assert loss.data.shape == (), \"MSE should return scalar\"\n", + " \n", + " # Cross-entropy test\n", + " pred = Tensor([[1.0, 2.0], [2.0, 1.0]])\n", + " target = Tensor([1, 0])\n", + " loss = ce(pred, target)\n", + " assert isinstance(loss, Tensor), \"CrossEntropy should return Tensor\"\n", + " assert loss.data.shape == (), \"CrossEntropy should return scalar\"\n", + " \n", + " # Binary cross-entropy test\n", + " pred = Tensor([[1.0], [-1.0]])\n", + " target = Tensor([[1.0], [0.0]])\n", + " loss = bce(pred, target)\n", + " assert isinstance(loss, Tensor), \"Binary CrossEntropy should return Tensor\"\n", + " assert loss.data.shape == (), \"Binary CrossEntropy should return scalar\"\n", + " \n", + " print(\" βœ… All loss functions return correct types\")\n", + " \n", + " # Test 3: Loss values are reasonable\n", + " print(\"\\n3. Loss Value Sanity Checks:\")\n", + " \n", + " # All losses should be non-negative\n", + " assert mse.forward(Tensor([1.0]), Tensor([2.0])).data >= 0, \"MSE should be non-negative\"\n", + " assert ce.forward(Tensor([[1.0, 0.0]]), Tensor([0])).data >= 0, \"CrossEntropy should be non-negative\"\n", + " assert bce.forward(Tensor([1.0]), Tensor([1.0])).data >= 0, \"Binary CrossEntropy should be non-negative\"\n", + " \n", + " print(\" βœ… All loss functions produce reasonable values\")\n", + " \n", + " # Test 4: Perfect predictions give low loss\n", + " print(\"\\n4. 
Perfect Prediction Tests:\")\n", + " \n", + " perfect_mse = mse(Tensor([5.0]), Tensor([5.0]))\n", + " perfect_ce = ce(Tensor([[10.0, 0.0]]), Tensor([0]))\n", + " perfect_bce = bce(Tensor([10.0]), Tensor([1.0]))\n", + " \n", + " assert perfect_mse.data < 1e-10, f\"Perfect MSE should be ~0, got {perfect_mse.data}\"\n", + " assert perfect_ce.data < 0.1, f\"Perfect CE should be low, got {perfect_ce.data}\"\n", + " assert perfect_bce.data < 0.1, f\"Perfect BCE should be low, got {perfect_bce.data}\"\n", + " \n", + " print(\" βœ… Perfect predictions produce low loss\")\n", + " \n", + " print(\"\\nπŸŽ‰ All comprehensive integration tests passed!\")\n", + " print(\" β€’ Loss functions instantiate correctly\")\n", + " print(\" β€’ Return types are consistent (Tensor scalars)\")\n", + " print(\" β€’ Loss values are mathematically sound\")\n", + " print(\" β€’ Perfect predictions are handled correctly\")\n", + " print(\" β€’ Ready for integration with neural network training!\")\n", + "\n", + "test_unit_comprehensive_loss_integration()" + ] + }, + { + "cell_type": "markdown", + "id": "5f2c082c", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Systems Analysis: Loss Function Performance and Engineering\n", + "\n", + "Let's analyze loss functions from an ML systems engineering perspective, focusing on performance, memory usage, and production implications.\n", + "\n", + "## Computational Complexity Deep Dive\n", + "\n", + "```\n", + "Algorithmic Analysis by Loss Type:\n", + "\n", + "MSE (Mean Squared Error):\n", + " Time: O(n) - linear in number of predictions\n", + " Space: O(1) - constant additional memory\n", + " Operations: n subtractions + n multiplications + 1 mean\n", + " Bottleneck: Memory bandwidth (simple arithmetic operations)\n", + " \n", + "CrossEntropy (Multi-Class):\n", + " Time: O(nΓ—c) - linear in samples Γ— classes \n", + " Space: O(nΓ—c) - store full probability distributions\n", + " Operations: nΓ—c exp + nΓ—c divisions + nΓ—c logs + 
reductions\n", + " Bottleneck: Exponential computations and memory bandwidth\n", + " \n", + "Binary CrossEntropy:\n", + " Time: O(n) - linear in number of samples\n", + " Space: O(n) - store one probability per sample\n", + " Operations: n max + n multiplications + n exp + n logs\n", + " Bottleneck: Transcendental functions (exp, log)\n", + "```\n", + "\n", + "## Memory Scaling Analysis\n", + "\n", + "Understanding memory requirements is crucial for large-scale training:\n", + "\n", + "```\n", + "Memory Requirements by Problem Scale:\n", + "\n", + "Small Problem (1K samples, 100 classes):\n", + " MSE: 8 KB (1K samples Γ— 8 bytes)\n", + " CrossEntropy: 800 KB (1K Γ— 100 Γ— 8 bytes)\n", + " Binary CE: 16 KB (1K Γ— 2 Γ— 8 bytes)\n", + "\n", + "Large Problem (100K samples, 10K classes):\n", + " MSE: 800 KB (independent of classes!)\n", + " CrossEntropy: 8 GB (memory bottleneck)\n", + " Binary CE: 1.6 MB (scales with samples only)\n", + "\n", + "Production Scale (1M samples, 50K vocab):\n", + " MSE: 8 MB\n", + " CrossEntropy: 400 GB (requires distributed memory)\n", + " Binary CE: 16 MB\n", + "```\n", + "\n", + "## Numerical Stability Engineering Analysis\n", + "\n", + "Production systems must handle edge cases robustly:\n", + "\n", + "```\n", + "Stability Challenges and Solutions:\n", + "\n", + "CrossEntropy Stability Issues:\n", + " Problem: exp(large_logit) β†’ overflow β†’ NaN gradients\n", + " Solution: log-sum-exp trick with max subtraction\n", + " \n", + " Problem: log(very_small_prob) β†’ -∞ β†’ training collapse\n", + " Solution: epsilon clipping (1e-15 to 1-1e-15)\n", + " \n", + "Binary CrossEntropy Stability Issues:\n", + " Problem: sigmoid(large_positive) β†’ 1.0 β†’ log(0) issues\n", + " Solution: stable logits formulation bypasses sigmoid\n", + " \n", + " Problem: exp(large_negative) in naive implementation\n", + " Solution: max(x,0) - x*y + log(1+exp(-|x|)) formulation\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "c48c075d", + "metadata": { 
+ "lines_to_next_cell": 1 + }, + "source": [ + "\"\"\"\n", + "# Production Performance Benchmarks\n", + "\n", + "Real-world performance characteristics matter for deployment:\n", + "\n", + "```\n", + "Inference Throughput (measured on modern hardware):\n", + " MSE: ~100M predictions/second\n", + " CrossEntropy: ~10M predictions/second \n", + " Binary CrossEntropy: ~80M predictions/second\n", + "\n", + "Training Memory Bandwidth Requirements:\n", + " MSE: ~800 MB/s (lightweight computation)\n", + " CrossEntropy: ~80 GB/s (10Γ— higher due to softmax!)\n", + " Binary CE: ~1.6 GB/s (moderate requirements)\n", + "\n", + "Gradient Computation Overhead:\n", + " MSE: 1.1Γ— forward pass time (simple derivatives)\n", + " CrossEntropy: 1.5Γ— forward pass time (softmax gradients)\n", + " Binary CE: 1.2Γ— forward pass time (sigmoid gradients)\n", + "```\n", + "\n", + "# Framework Integration and Production Patterns\n", + "\n", + "Understanding how production systems implement these concepts:\n", + "\n", + "```\n", + "PyTorch Implementation Patterns:\n", + " torch.nn.MSELoss() - Direct implementation, minimal overhead\n", + " torch.nn.CrossEntropyLoss() - Fused softmax+CE for efficiency\n", + " torch.nn.BCEWithLogitsLoss() - Stable logits formulation\n", + " \n", + "TensorFlow Implementation Patterns:\n", + " tf.keras.losses.MeanSquaredError() - Vectorized operations\n", + " tf.keras.losses.SparseCategoricalCrossentropy() - Memory efficient\n", + " tf.keras.losses.BinaryCrossentropy() - From logits option\n", + " \n", + "Production Optimizations:\n", + " - Mixed precision (FP16) for memory efficiency\n", + " - Gradient accumulation for large batch simulation\n", + " - Loss scaling to prevent underflow in mixed precision\n", + " - Checkpointing to trade memory for computation\n", + "```\n", + "\n", + "# Edge Device and Deployment Considerations\n", + "\n", + "Loss function choice affects deployment feasibility:\n", + "\n", + "```\n", + "Edge Device Constraints:\n", + " 
Memory-limited (phones, IoT): Prefer Binary CE > MSE > CrossEntropy\n", + " CPU-only inference: MSE has best compute efficiency\n", + " Real-time requirements: Binary classification most predictable\n", + " \n", + "Distributed Training Challenges:\n", + " CrossEntropy: Requires all-reduce across all classes (expensive!)\n", + " Gradient accumulation: MSE linear, CrossEntropy non-linear dependencies\n", + " Mixed precision: Different overflow handling per loss type\n", + " \n", + "Monitoring and Debugging:\n", + " MSE divergence: Explodes quadratically (easy to detect)\n", + " CrossEntropy divergence: More gradual degradation \n", + " BCE monitoring: Natural bounded behavior aids debugging\n", + "```\n", + "\"\"\"\n", + "\n", + "πŸ” SYSTEMS INSIGHT: Performance Profiling Analysis\n", + "def analyze_loss_performance_characteristics():\n", + " \"\"\"Comprehensive performance analysis of all loss functions.\"\"\"\n", + " print(\"πŸ” Loss Function Performance Analysis\")\n", + " print(\"=\" * 45)\n", + " \n", + " try:\n", + " import time\n", + " \n", + " # Initialize loss functions\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + " \n", + " print(\"\\n⚑ Computational Complexity Measurement:\")\n", + " \n", + " # Test different batch sizes to see scaling behavior\n", + " batch_sizes = [100, 1000, 10000]\n", + " \n", + " for batch_size in batch_sizes:\n", + " print(f\"\\n Batch size: {batch_size:,}\")\n", + " \n", + " # MSE timing\n", + " mse_pred = Tensor(np.random.randn(batch_size, 10))\n", + " mse_true = Tensor(np.random.randn(batch_size, 10))\n", + " \n", + " start = time.perf_counter()\n", + " for _ in range(100): # Average over multiple runs\n", + " mse_loss = mse(mse_pred, mse_true)\n", + " mse_time = (time.perf_counter() - start) / 100\n", + " \n", + " # CrossEntropy timing\n", + " ce_pred = Tensor(np.random.randn(batch_size, 100)) # 100 classes\n", + " ce_true = Tensor(np.random.randint(0, 100, 
batch_size))\n", + " \n", + " start = time.perf_counter()\n", + " for _ in range(100):\n", + " ce_loss = ce(ce_pred, ce_true)\n", + " ce_time = (time.perf_counter() - start) / 100\n", + " \n", + " # Binary CrossEntropy timing\n", + " bce_pred = Tensor(np.random.randn(batch_size, 1))\n", + " bce_true = Tensor(np.random.randint(0, 2, (batch_size, 1)).astype(float))\n", + " \n", + " start = time.perf_counter()\n", + " for _ in range(100):\n", + " bce_loss = bce(bce_pred, bce_true)\n", + " bce_time = (time.perf_counter() - start) / 100\n", + " \n", + " print(f\" MSE: {mse_time*1000:8.3f} ms\")\n", + " print(f\" CrossEntropy: {ce_time*1000:8.3f} ms\")\n", + " print(f\" Binary CE: {bce_time*1000:8.3f} ms\")\n", + " print(f\" CE/MSE ratio: {ce_time/mse_time:8.1f}x\")\n", + " \n", + " print(\"\\nπŸ’Ύ Memory Efficiency Analysis:\")\n", + " \n", + " # Compare memory usage for different problem sizes\n", + " problem_configs = [\n", + " (\"Small (1K samples, 10 classes)\", 1000, 10),\n", + " (\"Medium (10K samples, 100 classes)\", 10000, 100),\n", + " (\"Large (100K samples, 1K classes)\", 100000, 1000)\n", + " ]\n", + " \n", + " for name, samples, classes in problem_configs:\n", + " print(f\"\\n {name}:\")\n", + " \n", + " # Memory calculations (bytes)\n", + " mse_memory = samples * 8 # One value per sample\n", + " ce_memory = samples * classes * 8 # Full probability distribution\n", + " bce_memory = samples * 8 # One probability per sample\n", + " \n", + " print(f\" MSE memory: {mse_memory / 1024 / 1024:8.1f} MB\")\n", + " print(f\" CE memory: {ce_memory / 1024 / 1024:8.1f} MB\") \n", + " print(f\" BCE memory: {bce_memory / 1024 / 1024:8.1f} MB\")\n", + " print(f\" CE overhead: {ce_memory/mse_memory:8.1f}x\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: These performance characteristics determine\n", + " # which loss functions are feasible for different deployment scenarios.\n", + " # CrossEntropy's O(nΓ—c) memory scaling makes it prohibitive for \n", + " # large vocabularies 
without specialized techniques.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Performance analysis error: {e}\")\n", + " print(\"Performance analysis requires complete implementations\")\n", + "\n", + "πŸ” SYSTEMS INSIGHT: Numerical Stability Deep Analysis\n", + "def analyze_numerical_stability_edge_cases():\n", + " \"\"\"Deep analysis of numerical stability across all loss functions.\"\"\"\n", + " print(\"πŸ” Numerical Stability Edge Case Analysis\")\n", + " print(\"=\" * 50)\n", + " \n", + " try:\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + " \n", + " print(\"\\nπŸ›‘οΈ Extreme Value Stability Testing:\")\n", + " \n", + " # Test extreme values that could cause numerical issues\n", + " extreme_tests = [\n", + " (\"Huge positive\", 1e10),\n", + " (\"Huge negative\", -1e10),\n", + " (\"Tiny positive\", 1e-10),\n", + " (\"NaN input\", float('nan')),\n", + " (\"Infinity\", float('inf')),\n", + " (\"Negative infinity\", float('-inf'))\n", + " ]\n", + " \n", + " for name, value in extreme_tests:\n", + " print(f\"\\n Testing {name} ({value}):\")\n", + " \n", + " # MSE stability\n", + " try:\n", + " mse_loss = mse(Tensor([value]), Tensor([0.0]))\n", + " mse_stable = not (np.isnan(mse_loss.data) or np.isinf(mse_loss.data))\n", + " print(f\" MSE stable: {mse_stable} (loss: {mse_loss.data:.3e})\")\n", + " except:\n", + " print(f\" MSE stable: False (exception)\")\n", + " \n", + " # CrossEntropy stability \n", + " try:\n", + " ce_loss = ce(Tensor([[value, 0.0, 0.0]]), Tensor([0]))\n", + " ce_stable = not (np.isnan(ce_loss.data) or np.isinf(ce_loss.data))\n", + " print(f\" CE stable: {ce_stable} (loss: {ce_loss.data:.3e})\")\n", + " except:\n", + " print(f\" CE stable: False (exception)\")\n", + " \n", + " # Binary CrossEntropy stability\n", + " try:\n", + " bce_loss = bce(Tensor([value]), Tensor([1.0]))\n", + " bce_stable = not (np.isnan(bce_loss.data) or np.isinf(bce_loss.data))\n", + " 
print(f\" BCE stable: {bce_stable} (loss: {bce_loss.data:.3e})\")\n", + " except:\n", + " print(f\" BCE stable: False (exception)\")\n", + " \n", + " print(\"\\nπŸ”¬ Gradient Behavior Analysis:\")\n", + " \n", + " # Analyze gradient magnitudes for different prediction qualities\n", + " confidence_levels = [\n", + " (\"Very wrong\", [[-5.0, 5.0, 0.0]], [0]), # Predict class 1, actual class 0\n", + " (\"Slightly wrong\", [[-0.5, 0.5, 0.0]], [0]),\n", + " (\"Uncertain\", [[0.0, 0.0, 0.0]], [0]), \n", + " (\"Slightly right\", [[0.5, -0.5, 0.0]], [0]),\n", + " (\"Very right\", [[5.0, -5.0, 0.0]], [0])\n", + " ]\n", + " \n", + " print(\" Prediction Quality β†’ CrossEntropy Loss:\")\n", + " for name, logits, labels in confidence_levels:\n", + " loss = ce(Tensor(logits), Tensor(labels))\n", + " print(f\" {name:15}: {loss.data:8.4f}\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: Understanding how loss functions behave\n", + " # at extremes helps debug training failures and choose appropriate\n", + " # loss scaling and clipping strategies for production systems.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Stability analysis error: {e}\")\n", + " print(\"Stability analysis requires complete implementations\")\n", + "\n", + "πŸ” SYSTEMS INSIGHT: Mixed Precision Training Analysis\n", + "def analyze_mixed_precision_considerations():\n", + " \"\"\"Analyze loss function behavior with FP16 mixed precision training.\"\"\"\n", + " print(\"πŸ” Mixed Precision Training Analysis\")\n", + " print(\"=\" * 40)\n", + "\n", + " try:\n", + " print(\"\\n⚑ FP16 Numerical Range Analysis:\")\n", + " print(\" FP16 range: ~Β±65,504 (much smaller than FP32's ~Β±3.4Γ—10³⁸)\")\n", + "\n", + " # Simulate FP16 range limitations\n", + " fp16_max = 65504.0\n", + " fp16_min_normal = 2**-14 # Smallest normal FP16 number β‰ˆ 6.1Γ—10⁻⁡\n", + "\n", + " print(f\" FP16 maximum: Β±{fp16_max:,.0f}\")\n", + " print(f\" FP16 min normal: {fp16_min_normal:.2e}\")\n", + " print(f\" Risk: 
Gradients/losses exceeding range β†’ infinity/NaN\")\n", + "\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + "\n", + " print(f\"\\n🎯 Loss Function Mixed Precision Compatibility:\")\n", + "\n", + " # Test cases that might overflow in FP16\n", + " test_cases = [\n", + " (\"Small values\", 1.0, 1.1),\n", + " (\"Medium values\", 100.0, 110.0),\n", + " (\"Large values\", 1000.0, 1100.0),\n", + " (\"FP16 edge\", 200.0, 250.0) # Could cause issues when squared\n", + " ]\n", + "\n", + " print(f\"\\n {'Test Case':>15} {'MSE Loss':>12} {'FP16 Safe?':>12}\")\n", + " print(f\" {'-'*15} {'-'*12} {'-'*12}\")\n", + "\n", + " for name, pred, true in test_cases:\n", + " mse_loss = mse(Tensor([pred]), Tensor([true]))\n", + " squared_error = (pred - true) ** 2\n", + " fp16_safe = squared_error < fp16_max\n", + "\n", + " print(f\" {name:>15} {mse_loss.data:>12.1f} {'βœ…' if fp16_safe else '❌':>12}\")\n", + "\n", + " print(f\"\\nπŸ›‘οΈ Mixed Precision Loss Scaling Strategy:\")\n", + "\n", + " # Demonstrate loss scaling concept\n", + " loss_scales = [1.0, 128.0, 1024.0, 8192.0]\n", + " base_loss = 0.01 # Small loss that might underflow\n", + "\n", + " print(f\" {'Scale Factor':>12} {'Scaled Loss':>12} {'FP16 Precision':>15}\")\n", + " print(f\" {'-'*12} {'-'*12} {'-'*15}\")\n", + "\n", + " for scale in loss_scales:\n", + " scaled_loss = base_loss * scale\n", + "\n", + " # Check if loss is representable in FP16\n", + " if scaled_loss > fp16_min_normal and scaled_loss < fp16_max:\n", + " precision = \"Good\"\n", + " elif scaled_loss <= fp16_min_normal:\n", + " precision = \"Underflow risk\"\n", + " else:\n", + " precision = \"Overflow risk\"\n", + "\n", + " print(f\" {scale:>12.0f} {scaled_loss:>12.3f} {precision:>15}\")\n", + "\n", + " print(f\"\\nβš–οΈ Loss Function Mixed Precision Recommendations:\")\n", + "\n", + " recommendations = [\n", + " (\"MSE\", \"Monitor for gradient explosion in high-dynamic-range problems\", 
\"Medium risk\"),\n", + " (\"CrossEntropy\", \"Use FP32 for softmax computation, FP16 for storage\", \"High risk\"),\n", + " (\"Binary CE\", \"Stable formulation handles FP16 well with proper scaling\", \"Low risk\")\n", + " ]\n", + "\n", + " for loss_type, recommendation, risk in recommendations:\n", + " print(f\" {loss_type:>12}: {recommendation} ({risk})\")\n", + "\n", + " print(f\"\\nπŸ”§ Implementation Best Practices for Mixed Precision:\")\n", + "\n", + " best_practices = [\n", + " \"1. Use automatic mixed precision (AMP) libraries that handle scaling\",\n", + " \"2. Keep loss computation in FP32, only cast inputs to FP16\",\n", + " \"3. Monitor for overflow/underflow during training\",\n", + " \"4. Use gradient clipping to prevent extreme gradients\",\n", + " \"5. Scale losses up during forward pass, scale gradients down during backward\"\n", + " ]\n", + "\n", + " for practice in best_practices:\n", + " print(f\" {practice}\")\n", + "\n", + " # Example mixed precision training pattern\n", + " print(f\"\\nπŸ’» Mixed Precision Training Pattern:\")\n", + " print(f\" ```python\")\n", + " print(f\" # Forward pass in FP16\")\n", + " print(f\" with autocast():\")\n", + " print(f\" predictions = model(inputs.half()) # FP16 inputs\")\n", + " print(f\" loss = loss_fn(predictions, targets) # Loss computed in FP32\")\n", + " print(f\" \")\n", + " print(f\" # Scale loss to prevent underflow\")\n", + " print(f\" scaled_loss = loss * scale_factor\")\n", + " print(f\" scaled_loss.backward()\")\n", + " print(f\" \")\n", + " print(f\" # Unscale gradients before optimizer step\")\n", + " print(f\" scaler.step(optimizer) # Automatically unscales gradients\")\n", + " print(f\" ```\")\n", + "\n", + " # πŸ’‘ WHY THIS MATTERS: Mixed precision training can provide 1.5-2Γ— speedup\n", + " # and 50% memory reduction, but loss functions must be carefully implemented\n", + " # to handle the reduced numerical precision without losing training stability.\n", + "\n", + " except Exception 
as e:\n", + " print(f\"⚠️ Mixed precision analysis error: {e}\")\n", + " print(\"Mixed precision analysis requires complete loss implementations\")\n", + "\n", + "πŸ” SYSTEMS INSIGHT: Production Deployment Analysis\n", + "def analyze_production_deployment_patterns():\n", + " \"\"\"Analyze how loss functions affect production ML system design.\"\"\"\n", + " print(\"πŸ” Production Deployment Pattern Analysis\")\n", + " print(\"=\" * 50)\n", + " \n", + " try:\n", + " print(\"\\nπŸš€ Deployment Scenario Analysis:\")\n", + " \n", + " # Different deployment scenarios with constraints\n", + " scenarios = [\n", + " {\n", + " \"name\": \"Mobile App (Spam Detection)\",\n", + " \"constraints\": \"Memory < 50MB, Latency < 100ms\",\n", + " \"problem\": \"Binary classification\",\n", + " \"recommendation\": \"Binary CrossEntropy\",\n", + " \"reasoning\": \"Minimal memory, fast inference, stable numerics\"\n", + " },\n", + " {\n", + " \"name\": \"Cloud API (Image Classification)\", \n", + " \"constraints\": \"Throughput > 1000 QPS, Cost optimization\",\n", + " \"problem\": \"1000-class classification\",\n", + " \"recommendation\": \"CrossEntropy with mixed precision\",\n", + " \"reasoning\": \"Can handle memory cost, needs throughput\"\n", + " },\n", + " {\n", + " \"name\": \"Edge IoT (Temperature Prediction)\",\n", + " \"constraints\": \"Memory < 1MB, Power < 1W\",\n", + " \"problem\": \"Regression\",\n", + " \"recommendation\": \"MSE with quantization\",\n", + " \"reasoning\": \"Minimal compute, no transcendental functions\"\n", + " },\n", + " {\n", + " \"name\": \"Large Language Model Training\",\n", + " \"constraints\": \"50K vocabulary, Multi-GPU\",\n", + " \"problem\": \"Next token prediction\",\n", + " \"recommendation\": \"Hierarchical Softmax or Sampling\",\n", + " \"reasoning\": \"Standard CrossEntropy too memory intensive\"\n", + " }\n", + " ]\n", + " \n", + " for scenario in scenarios:\n", + " print(f\"\\n πŸ“± {scenario['name']}:\")\n", + " print(f\" Constraints: 
{scenario['constraints']}\")\n", + " print(f\" Problem Type: {scenario['problem']}\")\n", + " print(f\" Best Loss: {scenario['recommendation']}\")\n", + " print(f\" Why: {scenario['reasoning']}\")\n", + " \n", + " print(\"\\nβš–οΈ Production Trade-off Analysis:\")\n", + " \n", + " trade_offs = [\n", + " (\"Memory Efficiency\", \"MSE > Binary CE >> CrossEntropy\"),\n", + " (\"Computational Speed\", \"MSE > Binary CE > CrossEntropy\"),\n", + " (\"Numerical Stability\", \"MSE β‰ˆ Binary CE > CrossEntropy\"), \n", + " (\"Implementation Complexity\", \"MSE > CrossEntropy > Binary CE\"),\n", + " (\"Gradient Quality\", \"CrossEntropy > Binary CE > MSE\"),\n", + " (\"Debug-ability\", \"MSE > Binary CE > CrossEntropy\")\n", + " ]\n", + " \n", + " for criterion, ranking in trade_offs:\n", + " print(f\" {criterion:20}: {ranking}\")\n", + " \n", + " print(\"\\nπŸ”§ Framework Integration Patterns:\")\n", + " \n", + " frameworks = [\n", + " (\"PyTorch\", \"nn.MSELoss(), nn.CrossEntropyLoss(), nn.BCEWithLogitsLoss()\"),\n", + " (\"TensorFlow\", \"keras.losses.MSE, SparseCategoricalCrossentropy, BinaryCrossentropy\"),\n", + " (\"JAX\", \"optax.l2_loss, optax.softmax_cross_entropy, optax.sigmoid_binary_cross_entropy\"),\n", + " (\"Production\", \"Custom implementations with monitoring and fallbacks\")\n", + " ]\n", + " \n", + " for framework, losses in frameworks:\n", + " print(f\" {framework:12}: {losses}\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: Loss function choice affects every aspect\n", + " # of ML system design - from memory requirements to latency to\n", + " # debugging complexity. 
Understanding these trade-offs enables\n", + " # informed architectural decisions for production systems.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Deployment analysis error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1f0245d3", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## πŸ€” ML Systems Thinking: Interactive Questions\n", + "\n", + "Now that you've implemented all core loss functions and analyzed their systems characteristics, let's explore their implications for real ML systems:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0789afbb", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "question-1-loss-selection", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Question 1: Loss Function Selection for Production Systems**\n", + "\n", + "You're building a production recommendation system that predicts user ratings (1-5 stars) for movies.\n", + "\n", + "Your team proposes three approaches:\n", + "A) Regression approach: Use MSE loss with continuous outputs (1.0-5.0)\n", + "B) Classification approach: Use CrossEntropy loss with 5 distinct classes \n", + "C) Ordinal approach: Use a custom loss that penalizes being off by multiple stars more heavily\n", + "\n", + "Analyze each approach considering your implementations:\n", + "\n", + "**Technical Analysis:**\n", + "- How does the memory scaling of CrossEntropy (O(batch_size Γ— num_classes)) affect this 5-class problem?\n", + "- What are the computational complexity differences between MSE's O(n) and CrossEntropy's O(nΓ—c) for c=5?\n", + "- How do the gradient behaviors differ? 
(MSE's quadratic vs CrossEntropy's logarithmic penalties)\n", + "\n", + "**Systems Implications:**\n", + "- Which approach would be most memory efficient for large batch training?\n", + "- How does numerical stability differ when handling edge cases (ratings at boundaries)?\n", + "- Which approach would have the most predictable inference latency?\n", + "\n", + "**Business Alignment:**\n", + "- How well does each loss function's penalty structure match the business objective?\n", + "- What happens with fractional ratings like 3.7? How would each approach handle this?\n", + "- Which approach would be easiest to monitor and debug in production?\n", + "\n", + "Recommend an approach with justification based on your implementation experience.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "583f52ea", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "question-2-numerical-stability", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Question 2: Debugging Numerical Stability in Production**\n", + "\n", + "Your cross-entropy loss function works perfectly in development, but in production you start seeing NaN losses that crash training after several hours.\n", + "\n", + "**Root Cause Analysis:**\n", + "Based on your implementation of the log-sum-exp trick and epsilon clipping:\n", + "1. What specific numerical computations in cross-entropy can produce NaN values?\n", + "2. Walk through how your `max_logits = np.max(prediction_logits, axis=1, keepdims=True)` prevents overflow\n", + "3. Explain why `np.clip(softmax_pred, epsilon, 1.0 - epsilon)` prevents underflow\n", + "4. What would happen if you removed epsilon clipping? Trace through the computation.\n", + "\n", + "**Production Debugging:**\n", + "Given millions of training examples, how would you:\n", + "1. 
Identify which specific inputs trigger the numerical instability?\n", + "2. Modify your CrossEntropy implementation to add monitoring without affecting performance?\n", + "3. Design fallback behavior when numerical issues are detected?\n", + "4. Validate that your fixes don't change the mathematical behavior for normal inputs?\n", + "\n", + "**Comparison Analysis:**\n", + "- How does your stable Binary CrossEntropy formulation `max(x,0) - x*y + log(1 + exp(-|x|))` prevent similar issues?\n", + "- Why is MSE generally more numerically stable than CrossEntropy?\n", + "- How would you modify loss functions for mixed precision (FP16) training where numerical ranges are more limited?\n", + "\n", + "Research how PyTorch and TensorFlow handle these same challenges in their loss implementations.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f65771b", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "question-3-custom-loss-design", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Question 3: Implementing and Optimizing Custom Loss Functions**\n", + "\n", + "You've seen examples of custom loss functions for business objectives. Now analyze implementation and optimization challenges:\n", + "\n", + "**Scenario Analysis:**\n", + "Choose one custom loss from the examples (Asymmetric BCE, Focal Loss, Ranking-Aware, Multi-Task, or Contrastive) and analyze:\n", + "\n", + "**Implementation Deep Dive:**\n", + "1. Trace through the numerical computation step-by-step for your chosen custom loss\n", + "2. Identify potential numerical stability issues compared to standard loss functions\n", + "3. How does the computational complexity compare to MSE/CrossEntropy/Binary CE?\n", + "4. What additional memory overhead does the custom formulation introduce?\n", + "\n", + "**Gradient Flow Analysis:**\n", + "5. 
How do the custom weighting schemes affect gradient magnitudes during backpropagation?\n", + "6. What happens to gradient flow when the custom weights become extreme (very large or very small)?\n", + "7. How would you detect and handle gradient explosion or vanishing in your custom loss?\n", + "8. Design gradient clipping strategies specific to your chosen custom loss function\n", + "\n", + "**Production Integration Challenges:**\n", + "9. How would you implement your custom loss to work with mixed precision training (FP16)?\n", + "10. What logging and monitoring would you add to track custom loss behavior in production?\n", + "11. How would you A/B test a custom loss against standard losses without affecting user experience?\n", + "12. Design a rollback strategy if the custom loss causes training instability\n", + "\n", + "**Performance Optimization:**\n", + "13. Identify computational bottlenecks in your chosen custom loss implementation\n", + "14. How could you vectorize operations to improve batch processing efficiency?\n", + "15. What caching strategies could reduce redundant computations?\n", + "16. How would you benchmark training speed impact compared to standard losses?\n", + "\n", + "**Business Validation Framework:**\n", + "17. Design metrics to validate that your custom loss actually improves business objectives\n", + "18. How would you separate loss function improvements from other training improvements?\n", + "19. What offline evaluation would you perform before deploying the custom loss?\n", + "20. How would you monitor for unexpected business metric changes after deployment?\n", + "\n", + "Implement one optimization for your chosen custom loss and explain how it addresses a specific production challenge.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "4ed8ca84", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## 🎯 MODULE SUMMARY: Loss Functions - Learning Objectives Made Mathematical\n", + "\n", + "Congratulations! 
You've successfully implemented the complete foundation for neural network training objectives:\n", + "\n", + "### What You've Accomplished\n", + "βœ… **Complete Loss Function Library**: MSE for regression, CrossEntropy for multi-class classification, and Binary CrossEntropy for binary classification with production-grade numerical stability\n", + "βœ… **Systems Engineering Understanding**: Deep comprehension of computational complexity, memory scaling, and numerical stability requirements for reliable ML systems\n", + "βœ… **Mathematical Implementation Mastery**: Built loss functions from mathematical foundations through stable computational formulations to working code\n", + "βœ… **Production Readiness Knowledge**: Understanding of how loss function choice affects training speed, memory usage, and deployment feasibility\n", + "βœ… **Framework Integration Insight**: Clear connection between your implementations and how PyTorch/TensorFlow solve the same problems\n", + "\n", + "### Key Learning Outcomes\n", + "- **Loss Function Theory**: How mathematical loss functions translate business objectives into optimization targets that neural networks can learn from\n", + "- **Numerical Stability Engineering**: Critical importance of stable implementations that prevent catastrophic training failures in production systems\n", + "- **Systems Performance Analysis**: Understanding of computational complexity, memory scaling, and performance trade-offs that affect production deployment\n", + "- **Production ML Patterns**: Knowledge of how loss function choice affects system architecture, monitoring requirements, and debugging complexity\n", + "\n", + "### Mathematical Foundations Mastered \n", + "- **MSE computation**: `(1/n) Γ— Ξ£(y_pred - y_true)Β²` with smooth quadratic gradients for regression optimization\n", + "- **CrossEntropy with stable softmax**: Log-sum-exp trick and epsilon clipping for numerically robust classification\n", + "- **Binary CrossEntropy stability**: 
`max(x,0) - xΓ—y + log(1 + exp(-|x|))` formulation preventing overflow/underflow issues\n", + "- **Gradient behavior understanding**: How different loss functions create different optimization landscapes and learning dynamics\n", + "\n", + "### Professional Skills Developed\n", + "- **Production-quality implementation**: Robust numerical stability measures that prevent training failures with real-world data\n", + "- **Performance optimization**: Understanding of computational and memory complexity that affects scalability and deployment\n", + "- **Systems debugging**: Knowledge of how to identify and fix numerical stability issues in production ML systems\n", + "- **Framework integration**: Clear understanding of how your implementations connect to professional ML development workflows\n", + "\n", + "### Ready for Advanced Applications\n", + "Your loss function implementations now enable:\n", + "- **Complete training loops** that optimize neural networks on real datasets with proper convergence monitoring\n", + "- **Custom loss functions** that align with specific business objectives and domain requirements\n", + "- **Production deployment** with confidence in numerical stability and performance characteristics\n", + "- **Advanced optimization** techniques that build on solid loss function foundations\n", + "\n", + "### Connection to Real ML Systems\n", + "Your implementations mirror the essential patterns used in:\n", + "- **PyTorch's loss functions**: Same mathematical formulations with identical numerical stability measures\n", + "- **TensorFlow's losses**: Equivalent computational patterns and production-grade error handling\n", + "- **Production ML pipelines**: The exact loss functions that power real ML systems at companies like Google, Meta, and OpenAI\n", + "- **Research frameworks**: Foundation for experimenting with novel loss functions and training objectives\n", + "\n", + "### Next Steps\n", + "With solid loss function implementations, you're ready 
to:\n", + "1. **Export your module**: `tito module complete 04_losses`\n", + "2. **Validate integration**: `tito test --module losses`\n", + "3. **Explore autograd integration**: See how loss functions connect with automatic differentiation\n", + "4. **Ready for Module 06**: Build automatic gradient computation that makes loss-based learning possible!\n", + "\n", + "**Your achievement**: You've built the mathematical foundation that transforms predictions into learning signals - the critical bridge between model outputs and optimization objectives that makes neural network training possible!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfc087a8", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "final-demo", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " print(\"πŸ”₯ TinyTorch Loss Functions Module - Complete Demo\")\n", + " print(\"=\" * 55)\n", + " \n", + " # Test all core implementations\n", + " print(\"\\nπŸ§ͺ Testing All Loss Functions:\")\n", + " test_unit_mse_loss()\n", + " test_unit_crossentropy_loss()\n", + " test_unit_binary_crossentropy_loss()\n", + " test_unit_comprehensive_loss_integration()\n", + " \n", + " # Run systems analysis functions\n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"πŸ” Systems Analysis Functions\")\n", + " print(\"=\" * 30)\n", + "\n", + " visualize_loss_landscapes()\n", + " analyze_mse_properties()\n", + " analyze_crossentropy_stability()\n", + " analyze_binary_crossentropy_efficiency()\n", + " analyze_mixed_precision_considerations()\n", + " analyze_loss_performance_characteristics()\n", + " analyze_numerical_stability_edge_cases()\n", + " analyze_production_deployment_patterns()\n", + " \n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"πŸ“Š Loss Function Usage Examples\")\n", + " print(\"=\" * 35)\n", + " \n", + " # Example 1: Regression with MSE\n", + " print(\"\\n1. 
Regression Example (Predicting House Prices):\")\n", + " mse = MeanSquaredError()\n", + " house_predictions = Tensor([[250000, 180000, 320000]]) # Predicted prices\n", + " house_actual = Tensor([[240000, 175000, 315000]]) # Actual prices\n", + " regression_loss = mse(house_predictions, house_actual)\n", + " print(f\" House price prediction loss: ${regression_loss.data:,.0f}Β² average error\")\n", + " \n", + " # Example 2: Multi-class classification with CrossEntropy\n", + " print(\"\\n2. Multi-Class Classification Example (Image Recognition):\")\n", + " ce = CrossEntropyLoss()\n", + " image_logits = Tensor([[2.1, 0.5, -0.3, 1.8, 0.1], # Model outputs for 5 classes\n", + " [-0.2, 3.1, 0.8, -1.0, 0.4]]) # (cat, dog, bird, fish, rabbit)\n", + " true_classes = Tensor([0, 1]) # First image = cat, second = dog\n", + " classification_loss = ce(image_logits, true_classes)\n", + " print(f\" Image classification loss: {classification_loss.data:.4f}\")\n", + " \n", + " # Example 3: Binary classification with BCE\n", + " print(\"\\n3. 
Binary Classification Example (Spam Detection):\")\n", + " bce = BinaryCrossEntropyLoss()\n", + " spam_logits = Tensor([[1.2], [-0.8], [2.1], [-1.5]]) # Spam prediction logits\n", + " spam_labels = Tensor([[1.0], [0.0], [1.0], [0.0]]) # 1=spam, 0=not spam\n", + " spam_loss = bce(spam_logits, spam_labels)\n", + " print(f\" Spam detection loss: {spam_loss.data:.4f}\")\n", + " \n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"🎯 Loss Function Characteristics\")\n", + " print(\"=\" * 35)\n", + " \n", + " # Compare perfect vs imperfect predictions\n", + " print(\"\\nπŸ“Š Perfect vs Random Predictions:\")\n", + " \n", + " # Perfect predictions\n", + " perfect_mse = mse(Tensor([5.0]), Tensor([5.0]))\n", + " perfect_ce = ce(Tensor([[10.0, 0.0, 0.0]]), Tensor([0]))\n", + " perfect_bce = bce(Tensor([10.0]), Tensor([1.0]))\n", + " \n", + " print(f\" Perfect MSE loss: {perfect_mse.data:.6f}\")\n", + " print(f\" Perfect CE loss: {perfect_ce.data:.6f}\")\n", + " print(f\" Perfect BCE loss: {perfect_bce.data:.6f}\")\n", + " \n", + " # Random predictions\n", + " random_mse = mse(Tensor([3.0]), Tensor([5.0])) # Off by 2\n", + " random_ce = ce(Tensor([[0.0, 0.0, 0.0]]), Tensor([0])) # Uniform distribution\n", + " random_bce = bce(Tensor([0.0]), Tensor([1.0])) # 50% confidence\n", + " \n", + " print(f\" Random MSE loss: {random_mse.data:.6f}\")\n", + " print(f\" Random CE loss: {random_ce.data:.6f}\")\n", + " print(f\" Random BCE loss: {random_bce.data:.6f}\")\n", + " \n", + " print(\"\\nπŸŽ‰ Complete loss function foundation ready!\")\n", + " print(\" βœ… MSE for regression problems\")\n", + " print(\" βœ… CrossEntropy for multi-class classification\")\n", + " print(\" βœ… Binary CrossEntropy for binary classification\")\n", + " print(\" βœ… Numerically stable implementations\")\n", + " print(\" βœ… Production-ready batch processing\")\n", + " print(\" βœ… Systems analysis and performance insights\")\n", + " print(\" βœ… Ready for neural network training!\")" + ] + } + ], + 
"metadata": { + "jupytext": { + "main_language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modules/04_losses/losses_dev.py b/modules/04_losses/losses_dev.py index 149b1bbc..9bb2463c 100644 --- a/modules/04_losses/losses_dev.py +++ b/modules/04_losses/losses_dev.py @@ -72,7 +72,7 @@ try: # In a complete system, these would integrate with the autograd Variable system except ImportError: # For development, import from local modules - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # %% nbgrader={"grade": false, "grade_id": "losses-setup", "locked": false, "schema_version": 3, "solution": false, "task": false} @@ -85,7 +85,7 @@ print("Ready to build loss functions for neural network training!") """ ## Where This Code Lives in the Final Package -**Learning Side:** You work in modules/05_losses/losses_dev.py +**Learning Side:** You work in modules/04_losses/losses_dev.py **Building Side:** Code exports to tinytorch.core.losses ```python @@ -2081,7 +2081,7 @@ Your implementations mirror the essential patterns used in: ### Next Steps With solid loss function implementations, you're ready to: -1. **Export your module**: `tito module complete 05_losses` +1. **Export your module**: `tito module complete 04_losses` 2. **Validate integration**: `tito test --module losses` 3. **Explore autograd integration**: See how loss functions connect with automatic differentiation 4. **Ready for Module 06**: Build automatic gradient computation that makes loss-based learning possible! 
diff --git a/modules/07_training/training_dev.py b/modules/07_training/training_dev.py index 0cbf39d4..c825f5cf 100644 --- a/modules/07_training/training_dev.py +++ b/modules/07_training/training_dev.py @@ -51,9 +51,9 @@ import time import pickle # Add module directories to Python path -sys.path.append(os.path.abspath('modules/source/02_tensor')) -sys.path.append(os.path.abspath('modules/source/03_activations')) -sys.path.append(os.path.abspath('modules/source/04_layers')) +sys.path.append(os.path.abspath('modules/source/01_tensor')) +sys.path.append(os.path.abspath('modules/source/02_activations')) +sys.path.append(os.path.abspath('modules/source/03_layers')) sys.path.append(os.path.abspath('modules/source/05_networks')) sys.path.append(os.path.abspath('modules/source/06_autograd')) sys.path.append(os.path.abspath('modules/source/07_spatial')) @@ -67,14 +67,34 @@ sys.path.append(os.path.abspath('modules/source/09_dataloader')) # No longer needed # Import all the building blocks we need -from tinytorch.core.tensor import Tensor -from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax -from tinytorch.core.layers import Linear -from tinytorch.core.networks import Sequential, create_mlp -from tinytorch.core.spatial import Conv2D, flatten -from tinytorch.utils.data import Dataset, DataLoader -from tinytorch.core.autograd import Variable # FOR AUTOGRAD INTEGRATION -from tinytorch.core.optimizers import SGD, Adam +try: + from tinytorch.core.tensor import Tensor + from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax + from tinytorch.core.layers import Linear + from tinytorch.core.networks import Sequential, create_mlp + from tinytorch.core.spatial import Conv2D, flatten + from tinytorch.utils.data import Dataset, DataLoader + from tinytorch.core.autograd import Variable # FOR AUTOGRAD INTEGRATION + from tinytorch.core.optimizers import SGD, Adam +except ImportError: + # For development - import from local modules + import sys + import os + 
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_activations')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '03_layers')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '06_optimizers')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '08_spatial')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '09_dataloader')) + + from tensor_dev import Tensor + from activations_dev import ReLU, Sigmoid, Tanh, Softmax + from layers_dev import Linear, Sequential, create_mlp + from spatial_dev import Conv2D, flatten + from dataloader_dev import Dataset, DataLoader + from autograd_dev import Variable + from optimizers_dev import SGD, Adam # πŸ”₯ AUTOGRAD INTEGRATION: Loss functions now return Variables that support .backward() # This enables automatic gradient computation for neural network training! 
diff --git a/modules/08_spatial/spatial_dev.py b/modules/08_spatial/spatial_dev.py index b3594398..0ba4a036 100644 --- a/modules/08_spatial/spatial_dev.py +++ b/modules/08_spatial/spatial_dev.py @@ -57,9 +57,9 @@ try: except ImportError: # Development mode - import from local module files sys.path.extend([ - os.path.join(os.path.dirname(__file__), '..', '02_tensor'), - os.path.join(os.path.dirname(__file__), '..', '03_activations'), - os.path.join(os.path.dirname(__file__), '..', '04_layers') + os.path.join(os.path.dirname(__file__), '..', '01_tensor'), + os.path.join(os.path.dirname(__file__), '..', '02_activations'), + os.path.join(os.path.dirname(__file__), '..', '03_layers') ]) from tensor_dev import Tensor, Parameter from activations_dev import ReLU diff --git a/modules/10_tokenization/tokenization_dev.py b/modules/10_tokenization/tokenization_dev.py index e081e706..ec3b1f3b 100644 --- a/modules/10_tokenization/tokenization_dev.py +++ b/modules/10_tokenization/tokenization_dev.py @@ -55,7 +55,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development, import from local tensor module - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # %% nbgrader={"grade": false, "grade_id": "tokenization-welcome", "locked": false, "schema_version": 3, "solution": false, "task": false} diff --git a/modules/11_embeddings/embeddings_dev.py b/modules/11_embeddings/embeddings_dev.py index e2a2d0e4..82ad6f5f 100644 --- a/modules/11_embeddings/embeddings_dev.py +++ b/modules/11_embeddings/embeddings_dev.py @@ -54,7 +54,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development, import from local tensor module - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor 
# Try to import tokenization classes diff --git a/modules/12_attention/attention_dev.py b/modules/12_attention/attention_dev.py index 07e0b945..f79a4d83 100644 --- a/modules/12_attention/attention_dev.py +++ b/modules/12_attention/attention_dev.py @@ -60,7 +60,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development, import from local tensor module - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # Try to import embedding classes diff --git a/modules/13_transformers/transformers_dev.py b/modules/13_transformers/transformers_dev.py index a0c87bb1..94997f91 100644 --- a/modules/13_transformers/transformers_dev.py +++ b/modules/13_transformers/transformers_dev.py @@ -57,7 +57,7 @@ def _import_from_module_dev(module_name, class_names): module_path = os.path.join(os.path.dirname(__file__), '..', module_name) sys.path.insert(0, module_path) try: - if module_name == '02_tensor': + if module_name == '01_tensor': from tensor_dev import Tensor return {'Tensor': Tensor} elif module_name == '13_attention': @@ -81,7 +81,7 @@ if 'tinytorch' in sys.modules: from tinytorch.core.embeddings import Embedding, PositionalEncoding else: # Development: Import from local modules - tensor_imports = _import_from_module_dev('02_tensor', ['Tensor']) + tensor_imports = _import_from_module_dev('01_tensor', ['Tensor']) Tensor = tensor_imports['Tensor'] attention_imports = _import_from_module_dev('13_attention', diff --git a/modules/16_quantization/quantization_dev.py b/modules/16_quantization/quantization_dev.py index a1db5829..7f5ede84 100644 --- a/modules/16_quantization/quantization_dev.py +++ b/modules/16_quantization/quantization_dev.py @@ -65,7 +65,7 @@ try: from tinytorch.core.spatial import Conv2d, MaxPool2D except ImportError: # For development, import from local modules - 
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) sys.path.append(os.path.join(os.path.dirname(__file__), '..', '06_spatial')) try: from tensor_dev import Tensor diff --git a/modules/18_caching/caching_dev.py b/modules/18_caching/caching_dev.py index 1583dd5b..28fe94a4 100644 --- a/modules/18_caching/caching_dev.py +++ b/modules/18_caching/caching_dev.py @@ -57,7 +57,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development, import from local tensor module - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # Try to import attention classes diff --git a/progress.json b/progress.json index f9bf5e98..c55c4699 100644 --- a/progress.json +++ b/progress.json @@ -1,11 +1,16 @@ { "completed_modules": [ - "01" + "01", + "02", + "03", + "06", + "08" ], - "last_completed": "01", - "last_updated": "2025-09-28T07:57:44.694673", + "last_completed": "08", + "last_updated": "2025-09-28T08:07:12.088651", "started_modules": [ - "01" + "01", + "04" ], - "last_worked": "01" + "last_worked": "04" } \ No newline at end of file