diff --git a/modules/01_tensor/tensor_dev.py b/modules/01_tensor/tensor_dev.py index 9025db0e..31e638e1 100644 --- a/modules/01_tensor/tensor_dev.py +++ b/modules/01_tensor/tensor_dev.py @@ -1410,7 +1410,7 @@ Your tensor implementation now enables: - **Real data processing**: Handle images, text, and complex multi-dimensional datasets ### Export Your Work -1. **Export to package**: `tito module complete 02_tensor` +1. **Export to package**: `tito module complete 01_tensor` 2. **Verify integration**: Your Tensor class will be available as `tinytorch.core.tensor.Tensor` 3. **Enable next module**: Activations build on your tensor foundation diff --git a/modules/02_activations/activations_dev.py b/modules/02_activations/activations_dev.py index 616f8879..6c5ffbdb 100644 --- a/modules/02_activations/activations_dev.py +++ b/modules/02_activations/activations_dev.py @@ -61,7 +61,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development - import from local modules - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # In[ ]: @@ -1203,7 +1203,7 @@ if __name__ == "__main__": # - **Industry Standard**: Every major ML framework prioritizes optimizing these specific activation functions # ### Next Steps -# 1. **Export your module**: `tito module complete 03_activations` +# 1. **Export your module**: `tito module complete 02_activations` # 2. **Validate integration**: `tito test --module activations` # 3. **Explore activation variants**: Experiment with Leaky ReLU or GELU implementations # 4. **Ready for Module 04**: Layers - combining your activations with linear transformations! 
diff --git a/modules/03_layers/layers_dev.py b/modules/03_layers/layers_dev.py index 65693736..4026037c 100644 --- a/modules/03_layers/layers_dev.py +++ b/modules/03_layers/layers_dev.py @@ -71,7 +71,7 @@ else: # Development: Import from local module files # During development, we need to import directly from the source files # This allows us to work with modules before they're packaged - tensor_module_path = os.path.join(os.path.dirname(__file__), '..', '02_tensor') + tensor_module_path = os.path.join(os.path.dirname(__file__), '..', '01_tensor') sys.path.insert(0, tensor_module_path) try: from tensor_dev import Tensor, Parameter diff --git a/modules/04_losses/losses_dev.ipynb b/modules/04_losses/losses_dev.ipynb new file mode 100644 index 00000000..8f7ab4fe --- /dev/null +++ b/modules/04_losses/losses_dev.ipynb @@ -0,0 +1,2532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "54a999b1", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Loss Functions - Learning Objectives Made Mathematical\n", + "\n", + "Welcome to Loss Functions! 
You'll implement the critical bridge between model predictions and learning objectives that makes neural network training possible.\n", + "\n", + "## πŸ”— Building on Previous Learning\n", + "**What You Built Before**:\n", + "- Module 01 (Tensor): Data structures for predictions and targets\n", + "- Module 02 (Activations): Nonlinear transformations for model outputs\n", + "- Module 03 (Layers): Complete neural network layers that produce predictions\n", + "\n", + "**What's Working**: You can build networks that transform inputs into predictions!\n", + "\n", + "**The Gap**: Predictions aren't learning objectives - you need to measure how \"wrong\" predictions are and provide gradient signals for improvement.\n", + "\n", + "**This Module's Solution**: Implement MSE, CrossEntropy, and BinaryCrossEntropy loss functions with numerical stability.\n", + "\n", + "**Connection Map**:\n", + "```\n", + "Layers β†’ Loss Functions β†’ Gradients\n", + "(predictions) (objectives) (learning signals)\n", + "```\n", + "\n", + "## Learning Goals (Systems-Focused)\n", + "- **Systems understanding**: How loss functions translate business problems into optimization objectives with proper numerical stability\n", + "- **Core implementation skill**: Build production-quality loss functions with stable computation and efficient batch processing\n", + "- **Pattern mastery**: Understand how different loss functions shape learning dynamics and convergence behavior\n", + "- **Framework connections**: See how your implementations mirror PyTorch's loss functions and autograd integration patterns\n", + "- **Optimization trade-offs**: Learn why numerical stability and computational efficiency matter for reliable training at scale\n", + "\n", + "## Build β†’ Use β†’ Reflect\n", + "1. **Build**: Complete loss function implementations with numerical stability and gradient support\n", + "2. **Use**: Apply loss functions to regression and classification problems with real neural networks\n", + "3. 
**Reflect**: Why do different loss functions lead to different learning behaviors, and when does numerical stability matter?\n", + "\n", + "## What You'll Achieve\n", + "By implementing loss functions from scratch, you'll understand:\n", + "- Deep technical understanding of how loss functions quantify prediction quality and enable learning\n", + "- Practical capability to implement numerically stable loss computation for production ML systems\n", + "- Systems insight into computational complexity, memory requirements, and batch processing efficiency\n", + "- Performance awareness of how loss function choice affects training speed and convergence characteristics\n", + "- Production knowledge of how frameworks implement robust loss computation with proper error handling\n", + "\n", + "## Systems Reality Check\n", + "πŸ’‘ **Production Context**: PyTorch's loss functions use numerically stable implementations and automatic mixed precision to handle extreme gradients and values\n", + "⚑ **Performance Insight**: Numerically unstable loss functions can cause training to fail catastrophically - proper implementation is critical for reliable ML systems" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfe05289", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "losses-imports", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "#| default_exp core.losses\n", + "\n", + "#| export\n", + "import numpy as np\n", + "import sys\n", + "import os\n", + "\n", + "# Import our building blocks - try package first, then local modules\n", + "try:\n", + " from tinytorch.core.tensor import Tensor\n", + " # Note: For now, we'll use simplified implementations without full autograd\n", + " # In a complete system, these would integrate with the autograd Variable system\n", + "except ImportError:\n", + " # For development, import from local modules\n", + " 
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))\n", + " from tensor_dev import Tensor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0f986fc", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "losses-setup", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "print(\"πŸ”₯ TinyTorch Loss Functions Module\")\n", + "print(f\"NumPy version: {np.__version__}\")\n", + "print(f\"Python version: {sys.version_info.major}.{sys.version_info.minor}\")\n", + "print(\"Ready to build loss functions for neural network training!\")" + ] + }, + { + "cell_type": "markdown", + "id": "899f0152", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## Where This Code Lives in the Final Package\n", + "\n", + "**Learning Side:** You work in modules/04_losses/losses_dev.py \n", + "**Building Side:** Code exports to tinytorch.core.losses\n", + "\n", + "```python\n", + "# Final package structure:\n", + "from tinytorch.core.losses import MeanSquaredError, CrossEntropyLoss, BinaryCrossEntropyLoss # All loss functions!\n", + "from tinytorch.core.tensor import Tensor # The foundation\n", + "from tinytorch.core.layers import Linear, Sequential # Network components\n", + "```\n", + "\n", + "**Why this matters:**\n", + "- **Learning:** Focused module for understanding loss functions and training objectives\n", + "- **Production:** Proper organization like PyTorch's torch.nn with all loss functions together\n", + "- **Consistency:** All loss functions live together in core.losses for easy access\n", + "- **Integration:** Works seamlessly with tensors and neural networks for complete training systems" + ] + }, + { + "cell_type": "markdown", + "id": "409b9591", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Understanding Loss Functions in Neural Networks\n", + "\n", + "## What are Loss Functions?\n", + "\n", + "Loss functions are the 
mathematical bridge between what your model predicts and what you want it to learn. They quantify the \"distance\" between predictions and reality.\n", + "\n", + "```\n", + "Business Goal: \"Predict house prices accurately\"\n", + " ↓\n", + "Mathematical Loss: MSE = (predicted_price - actual_price)Β²\n", + " ↓ \n", + "Optimization Signal: gradient = 2 Γ— (predicted - actual)\n", + " ↓\n", + "Learning Update: parameter -= learning_rate Γ— gradient\n", + "```\n", + "\n", + "## The Learning Ecosystem\n", + "\n", + "Loss functions provide four critical capabilities:\n", + "\n", + "🎯 **Learning Objectives**: Define what \"good\" performance means mathematically \n", + "πŸ“ˆ **Gradient Signal**: Provide directional improvement information for parameters \n", + "πŸ” **Progress Measurement**: Enable monitoring training progress and convergence detection \n", + "βš–οΈ **Trade-off Control**: Balance different aspects of model performance and regularization \n", + "\n", + "## Visual Understanding: Loss Function Landscape\n", + "\n", + "```\n", + "Loss Function Behavior:\n", + " MSE Loss CrossEntropy Loss\n", + " High β”‚ β•±β•² High β”‚ β•±β•²\n", + " β”‚ β•± β•² β”‚ β•± β•²\n", + " β”‚ β•± β•² β”‚ β•± β•²\n", + " β”‚ β•± β•² β”‚ β•± β•²\n", + " Low β”‚β•± β•² Low β”‚ β•± β•²\n", + " └────────────── └──────────────\n", + " Wrong Right Wrong Right\n", + " \n", + " β€’ Smooth gradients β€’ Steep near wrong predictions\n", + " β€’ Quadratic penalty β€’ Gentle near correct predictions\n", + " β€’ Good for regression β€’ Good for classification\n", + "```\n", + "\n", + "Different loss functions create different optimization landscapes that affect how your model learns!" + ] + }, + { + "cell_type": "markdown", + "id": "429bbae2", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Mean Squared Error - Foundation for Regression\n", + "\n", + "MSE is the cornerstone loss function for regression problems. 
It measures prediction quality by penalizing large errors more than small ones.\n", + "\n", + "## Visual Understanding: MSE Behavior\n", + "\n", + "```\n", + "MSE Loss Visualization:\n", + " \n", + " Loss β”‚ β•±β•²\n", + " 4 β”‚ β•± β•² β€’ Error = 2 β†’ Loss = 4\n", + " 3 β”‚ β•± β•² β€’ Error = 1 β†’ Loss = 1\n", + " 2 β”‚ β•± β•² β€’ Error = 0 β†’ Loss = 0\n", + " 1 β”‚ β•± β•² β€’ Quadratic penalty!\n", + " 0 β”‚β•±__________β•²____\n", + " -2 -1 0 1 2\n", + " Error\n", + " \n", + "Gradient Flow:\n", + " βˆ‚Loss/βˆ‚prediction = 2 Γ— (predicted - actual)\n", + " \n", + " Large errors β†’ Large gradients β†’ Big updates\n", + " Small errors β†’ Small gradients β†’ Fine tuning\n", + "```\n", + "\n", + "## Mathematical Foundation\n", + "\n", + "For batch of predictions and targets:\n", + "```\n", + "MSE = (1/n) Γ— Ξ£(y_pred - y_true)Β²\n", + "\n", + "Gradient: βˆ‚MSE/βˆ‚y_pred = (2/n) Γ— (y_pred - y_true)\n", + "```\n", + "\n", + "## Learning Objectives\n", + "By implementing MSE, you'll understand:\n", + "- How regression loss functions translate continuous prediction errors into optimization signals\n", + "- Why squared error creates smooth, well-behaved gradients for stable optimization\n", + "- How batch processing enables efficient training on multiple samples simultaneously\n", + "- The connection between mathematical loss formulations and practical ML training dynamics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80f4f2d2", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "mse-concept-question", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Computational Question: MSE Properties**\n", + "\n", + "Before implementing, let's understand MSE behavior:\n", + "\n", + "1. If you predict house price as $300k but actual is $250k, what's the MSE?\n", + "2. 
If you predict $310k but actual is $250k, what's the MSE? \n", + "3. Which error gets penalized more heavily and why?\n", + "4. How does this relate to the quadratic penalty we visualized?\n", + "\n", + "This understanding will guide your implementation approach.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2533af31", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "mse-loss-implementation", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class MeanSquaredError:\n", + " \"\"\"\n", + " Mean Squared Error Loss for Regression Problems\n", + " \n", + " Computes the average squared difference between predictions and targets:\n", + " MSE = (1/n) Γ— Ξ£(y_pred - y_true)Β²\n", + " \n", + " Features:\n", + " - Numerically stable computation\n", + " - Efficient batch processing\n", + " - Clean gradient properties for optimization\n", + " - Compatible with tensor operations\n", + " \n", + " Example Usage:\n", + " mse = MeanSquaredError()\n", + " loss = mse(predictions, targets) # Returns scalar loss value\n", + " \"\"\"\n", + " \n", + " def __init__(self):\n", + " \"\"\"Initialize MSE loss function.\"\"\"\n", + " pass\n", + " \n", + " def __call__(self, y_pred, y_true):\n", + " \"\"\"\n", + " Compute MSE loss between predictions and targets.\n", + " \n", + " Args:\n", + " y_pred: Model predictions (Tensor, shape: [batch_size, ...])\n", + " y_true: True targets (Tensor, shape: [batch_size, ...])\n", + " \n", + " Returns:\n", + " Tensor with scalar loss value\n", + " \n", + " TODO: Implement MSE computation with proper tensor handling.\n", + " \n", + " APPROACH:\n", + " 1. Convert inputs to tensors for consistent processing\n", + " 2. Compute element-wise prediction errors (differences)\n", + " 3. Square the errors to create quadratic penalty\n", + " 4. 
Take mean across all elements for final loss\n", + " \n", + " EXAMPLE:\n", + " >>> mse = MeanSquaredError()\n", + " >>> pred = Tensor([[1.0, 2.0]])\n", + " >>> true = Tensor([[1.5, 1.5]])\n", + " >>> loss = mse(pred, true)\n", + " >>> print(loss.data)\n", + " 0.25 # [(1.0-1.5)Β² + (2.0-1.5)Β²] / 2 = [0.25 + 0.25] / 2\n", + " \n", + " HINTS:\n", + " - Use np.mean() for efficient batch averaging\n", + " - Element-wise operations work naturally with tensor.data\n", + " - Return result wrapped in Tensor for consistent interface\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " # Step 1: Ensure we have tensor inputs for consistent processing\n", + " if not isinstance(y_pred, Tensor):\n", + " y_pred = Tensor(y_pred)\n", + " if not isinstance(y_true, Tensor):\n", + " y_true = Tensor(y_true)\n", + " \n", + " # Step 2: Compute mean squared error with element-wise operations\n", + " prediction_errors = y_pred.data - y_true.data # Element-wise difference\n", + " squared_errors = prediction_errors * prediction_errors # Element-wise squaring\n", + " mean_loss = np.mean(squared_errors) # Average across all elements\n", + " \n", + " return Tensor(mean_loss)\n", + " ### END SOLUTION\n", + " \n", + " def forward(self, y_pred, y_true):\n", + " \"\"\"Alternative interface for forward pass.\"\"\"\n", + " return self.__call__(y_pred, y_true)\n", + "\n", + "# πŸ” SYSTEMS INSIGHT: Gradient Landscape Visualization\n", + "def visualize_loss_landscapes():\n", + " \"\"\"Visualize how different loss functions create different optimization landscapes.\"\"\"\n", + " print(\"πŸ” Loss Function Landscape Visualization\")\n", + " print(\"=\" * 45)\n", + "\n", + " try:\n", + " import numpy as np\n", + "\n", + " # Create prediction space for visualization\n", + " prediction_range = np.linspace(-3, 3, 100)\n", + " true_value = 0.0 # Target value\n", + "\n", + " print(\"\\nπŸ“ˆ Loss Landscape Comparison:\")\n", + " print(\" How loss changes as predictions move away from target\")\n", + "\n", + " # 
Calculate loss landscapes\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + "\n", + " # MSE landscape (regression)\n", + " mse_losses = []\n", + " for pred in prediction_range:\n", + " loss = mse(Tensor([pred]), Tensor([true_value]))\n", + " mse_losses.append(loss.data)\n", + "\n", + " # Binary CE landscape (classification)\n", + " bce_losses = []\n", + " for pred in prediction_range:\n", + " loss = bce(Tensor([pred]), Tensor([1.0])) # Target: positive class\n", + " bce_losses.append(loss.data)\n", + "\n", + " # Find key gradient characteristics\n", + " mse_gradient_at_zero = 2 * (0 - true_value) # MSE gradient formula\n", + " mse_gradient_at_one = 2 * (1 - true_value)\n", + "\n", + " print(f\"\\n🎯 Gradient Behavior Analysis:\")\n", + " print(f\" MSE gradient at prediction=0: {mse_gradient_at_zero:.3f}\")\n", + " print(f\" MSE gradient at prediction=1: {mse_gradient_at_one:.3f}\")\n", + " print(f\" MSE provides linear gradient growth\")\n", + "\n", + " # Binary CE gradient analysis\n", + " sigmoid_at_zero = 1 / (1 + np.exp(-0)) # = 0.5\n", + " bce_grad_at_zero = sigmoid_at_zero - 1.0 # = -0.5\n", + " sigmoid_at_one = 1 / (1 + np.exp(-1)) # β‰ˆ 0.73\n", + " bce_grad_at_one = sigmoid_at_one - 1.0 # β‰ˆ -0.27\n", + "\n", + " print(f\" BCE gradient at logit=0: {bce_grad_at_zero:.3f}\")\n", + " print(f\" BCE gradient at logit=1: {bce_grad_at_one:.3f}\")\n", + " print(f\" BCE provides adaptive gradient magnitude\")\n", + "\n", + " # Visualize ASCII loss curves\n", + " print(f\"\\nπŸ“Š Loss Function Shapes (ASCII visualization):\")\n", + " print(f\" Prediction range: {prediction_range[0]:.1f} to {prediction_range[-1]:.1f}\")\n", + "\n", + " # Sample key points for visualization\n", + " sample_points = [-2, -1, 0, 1, 2]\n", + " print(f\"\\n {'Prediction':>10} {'MSE Loss':>10} {'BCE Loss':>10} {'Gradient Type':>15}\")\n", + " print(f\" {'-'*10} {'-'*10} {'-'*10} {'-'*15}\")\n", + "\n", + " for point in 
sample_points:\n", + " mse_loss = mse(Tensor([point]), Tensor([0.0]))\n", + " bce_loss = bce(Tensor([point]), Tensor([1.0]))\n", + "\n", + " # Characterize gradient steepness\n", + " if abs(point) < 0.5:\n", + " grad_type = \"Gentle\"\n", + " elif abs(point) < 1.5:\n", + " grad_type = \"Moderate\"\n", + " else:\n", + " grad_type = \"Steep\"\n", + "\n", + " print(f\" {point:>10.1f} {mse_loss.data:>10.3f} {bce_loss.data:>10.3f} {grad_type:>15}\")\n", + "\n", + " # Optimization implications\n", + " print(f\"\\nπŸš€ Optimization Implications:\")\n", + " print(f\" MSE (Regression):\")\n", + " print(f\" β€’ Quadratic penalty grows smoothly\")\n", + " print(f\" β€’ Large errors β†’ large gradients (aggressive correction)\")\n", + " print(f\" β€’ Small errors β†’ small gradients (fine-tuning)\")\n", + " print(f\" β€’ Symmetric around target value\")\n", + "\n", + " print(f\" Binary CrossEntropy (Classification):\")\n", + " print(f\" β€’ Logarithmic penalty creates adaptive gradients\")\n", + " print(f\" β€’ Wrong confident predictions β†’ steep gradients\")\n", + " print(f\" β€’ Right confident predictions β†’ gentle gradients\")\n", + " print(f\" β€’ Asymmetric penalty structure encourages confidence\")\n", + "\n", + " # πŸ’‘ WHY THIS MATTERS: Different loss landscapes create different\n", + " # optimization dynamics. 
MSE's smooth quadratic surface enables\n", + " # stable gradient descent, while CrossEntropy's adaptive gradients\n", + " # help classification models learn faster from confident mistakes.\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Visualization error: {e}\")\n", + " print(\"Ensure loss functions are implemented for landscape analysis\")\n", + "\n", + "# πŸ” SYSTEMS INSIGHT: MSE Computational Analysis\n", + "def analyze_mse_properties():\n", + " \"\"\"Analyze MSE loss characteristics for systems understanding.\"\"\"\n", + " print(\"πŸ” MSE Loss Analysis - Understanding the Math\")\n", + " print(\"=\" * 45)\n", + " \n", + " try:\n", + " mse = MeanSquaredError()\n", + " \n", + " # Error magnitude vs loss relationship\n", + " print(\"\\nπŸ“Š Error Magnitude vs Loss (Quadratic Penalty):\")\n", + " errors = [0.1, 0.5, 1.0, 2.0, 5.0]\n", + " for error in errors:\n", + " pred = Tensor([error])\n", + " true = Tensor([0.0])\n", + " loss = mse(pred, true)\n", + " print(f\" Error: {error:4.1f} β†’ Loss: {loss.data:8.3f} (Γ— {loss.data/(error**2):5.1f} baseline)\")\n", + " \n", + " # Batch vs individual processing\n", + " print(f\"\\n⚑ Batch Processing Efficiency:\")\n", + " single_losses = []\n", + " for i in range(100):\n", + " pred = Tensor([np.random.randn()])\n", + " true = Tensor([np.random.randn()])\n", + " loss = mse(pred, true)\n", + " single_losses.append(loss.data)\n", + " \n", + " # Batch version\n", + " batch_pred = Tensor(np.random.randn(100))\n", + " batch_true = Tensor(np.random.randn(100))\n", + " batch_loss = mse(batch_pred, batch_true)\n", + " \n", + " individual_mean = np.mean(single_losses)\n", + " print(f\" Individual losses mean: {individual_mean:.6f}\")\n", + " print(f\" Batch loss: {batch_loss.data:.6f}\")\n", + " print(f\" Difference: {abs(individual_mean - batch_loss.data):.8f}\")\n", + " \n", + " # Memory efficiency analysis\n", + " import sys\n", + " small_tensor = Tensor([1.0])\n", + " large_tensor = 
Tensor(np.random.randn(1000))\n", + " \n", + " print(f\"\\nπŸ’Ύ Memory Efficiency:\")\n", + " print(f\" Small loss memory: {sys.getsizeof(small_tensor.data)} bytes\")\n", + " print(f\" Large loss memory: {sys.getsizeof(large_tensor.data)} bytes\")\n", + " print(f\" MSE memory is independent of input size!\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: MSE provides stable, well-behaved gradients\n", + " # that are proportional to error magnitude, making optimization smooth.\n", + " # The quadratic penalty means large errors dominate learning initially,\n", + " # then fine-tuning happens as errors get smaller.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Analysis error: {e}\")\n", + " print(\"Ensure MSE implementation is complete before running analysis\")" + ] + }, + { + "cell_type": "markdown", + "id": "c0b9be9f", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### πŸ§ͺ Unit Test: MSE Loss Computation\n", + "This test validates `MeanSquaredError.__call__`, ensuring correct MSE computation with various input types and batch sizes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39a9be44", + "metadata": { + "nbgrader": { + "grade": true, + "grade_id": "test-mse-loss", + "locked": true, + "points": 3, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_unit_mse_loss():\n", + " \"\"\"Test MSE loss implementation.\"\"\"\n", + " print(\"πŸ§ͺ Testing Mean Squared Error Loss...\")\n", + " \n", + " mse = MeanSquaredError()\n", + " \n", + " # Test case 1: Perfect predictions (loss should be 0)\n", + " y_pred = Tensor([[1.0, 2.0], [3.0, 4.0]])\n", + " y_true = Tensor([[1.0, 2.0], [3.0, 4.0]])\n", + " loss = mse(y_pred, y_true)\n", + " assert abs(loss.data) < 1e-6, f\"Perfect predictions should have loss β‰ˆ 0, got {loss.data}\"\n", + " print(\"βœ… Perfect predictions test passed\")\n", + " \n", + " # Test case 2: Known loss computation\n", + " y_pred = Tensor([[1.0, 2.0]])\n", + " y_true = Tensor([[0.0, 1.0]])\n", + " loss = mse(y_pred, y_true)\n", + " expected = 1.0 # [(1-0)Β² + (2-1)Β²] / 2 = [1 + 1] / 2 = 1.0\n", + " assert abs(loss.data - expected) < 1e-6, f\"Expected loss {expected}, got {loss.data}\"\n", + " print(\"βœ… Known loss computation test passed\")\n", + " \n", + " # Test case 3: Batch processing\n", + " y_pred = Tensor([[1.0, 2.0], [3.0, 4.0]])\n", + " y_true = Tensor([[1.5, 2.5], [2.5, 3.5]])\n", + " loss = mse(y_pred, y_true)\n", + " expected = 0.25 # All squared differences are 0.25\n", + " assert abs(loss.data - expected) < 1e-6, f\"Expected batch loss {expected}, got {loss.data}\"\n", + " print(\"βœ… Batch processing test passed\")\n", + " \n", + " # Test case 4: Single value\n", + " y_pred = Tensor([5.0])\n", + " y_true = Tensor([3.0])\n", + " loss = mse(y_pred, y_true)\n", + " expected = 4.0 # (5-3)Β² = 4\n", + " assert abs(loss.data - expected) < 1e-6, f\"Expected single value loss {expected}, got {loss.data}\"\n", + " print(\"βœ… Single value test passed\")\n", + " \n", + " print(\"πŸŽ‰ MSE 
loss tests passed! Understanding regression objectives.\")\n", + "\n", + "test_unit_mse_loss()" + ] + }, + { + "cell_type": "markdown", + "id": "48e960ae", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Cross-Entropy Loss - Foundation for Multi-Class Classification\n", + "\n", + "Cross-Entropy Loss measures the \"information distance\" between predicted probability distributions and true class labels. It's the gold standard for classification problems.\n", + "\n", + "## Visual Understanding: Cross-Entropy Behavior\n", + "\n", + "```\n", + "Cross-Entropy Loss for 3-Class Problem:\n", + "\n", + "Class Probabilities after Softmax:\n", + " Input: [2.0, 1.0, 0.1] β†’ Probabilities: [0.66, 0.24, 0.10]\n", + " True: Class 0 (index 0) β†’ Target: [1.0, 0.0, 0.0]\n", + " \n", + "Loss Computation:\n", + " CE = -log(probability_of_correct_class)\n", + " CE = -log(0.66) = 0.415\n", + " \n", + "Intuition:\n", + " High confidence + Correct β†’ Low loss\n", + " High confidence + Wrong β†’ High loss \n", + " Low confidence + Any β†’ Medium loss\n", + "\n", + "Gradient Behavior:\n", + " Wrong predictions β†’ Steep gradients β†’ Big corrections\n", + " Right predictions β†’ Gentle gradients β†’ Fine tuning\n", + "```\n", + "\n", + "## Numerical Stability Challenge\n", + "\n", + "```\n", + "The Numerical Stability Problem:\n", + " \n", + " Raw logits: [50.0, 49.0, 48.0]\n", + " Naive softmax: exp(50)/[exp(50)+exp(49)+exp(48)]\n", + " Problem: exp(50) β‰ˆ 5Γ—10Β²ΒΉ β†’ Overflow!\n", + " \n", + "Our Solution (Log-Sum-Exp Trick):\n", + " 1. max_val = max(logits) = 50.0\n", + " 2. stable_logits = [0.0, -1.0, -2.0] # Subtract max\n", + " 3. exp([0.0, -1.0, -2.0]) = [1.0, 0.37, 0.14]\n", + " 4. 
Safe softmax: [0.67, 0.25, 0.09]\n", + "```\n", + "\n", + "## Mathematical Foundation\n", + "\n", + "For predictions and class indices:\n", + "```\n", + "CrossEntropy = -Ξ£ y_true Γ— log(softmax(y_pred))\n", + "\n", + "Softmax: softmax(x_i) = exp(x_i) / Ξ£ exp(x_j)\n", + "Stable: softmax(x_i) = exp(x_i - max(x)) / Ξ£ exp(x_j - max(x))\n", + "```\n", + "\n", + "## Learning Objectives\n", + "By implementing Cross-Entropy, you'll understand:\n", + "- How classification losses work with probability distributions and information theory\n", + "- Why softmax normalization creates proper probability distributions for multi-class problems\n", + "- The critical importance of numerical stability in exponential and logarithmic computations\n", + "- How cross-entropy naturally encourages confident, correct predictions through its gradient structure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22a7ac21", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "crossentropy-concept-question", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Computational Question: CrossEntropy Stability**\n", + "\n", + "Consider numerical stability in cross-entropy:\n", + "\n", + "1. What happens if you compute exp(100) directly?\n", + "2. Why does subtracting the maximum value prevent overflow?\n", + "3. What happens if log(0) occurs during loss computation?\n", + "4. 
How does epsilon clipping prevent this issue?\n", + "\n", + "Understanding these edge cases is crucial for reliable implementation.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b638a54b", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "crossentropy-loss-implementation", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class CrossEntropyLoss:\n", + " \"\"\"\n", + " Cross-Entropy Loss for Multi-Class Classification Problems\n", + " \n", + " Computes the cross-entropy between predicted probability distributions\n", + " and true class labels with numerically stable implementation.\n", + " \n", + " Features:\n", + " - Numerically stable softmax computation using log-sum-exp trick\n", + " - Support for both class indices and one-hot encoding\n", + " - Efficient batch processing with proper broadcasting\n", + " - Automatic handling of edge cases and extreme values\n", + " \n", + " Example Usage:\n", + " ce_loss = CrossEntropyLoss()\n", + " loss = ce_loss(logits, class_indices) # Returns scalar loss value\n", + " \"\"\"\n", + " \n", + " def __init__(self):\n", + " \"\"\"Initialize CrossEntropy loss function.\"\"\"\n", + " pass\n", + " \n", + " def __call__(self, y_pred, y_true):\n", + " \"\"\"\n", + " Compute CrossEntropy loss between predictions and targets.\n", + " \n", + " Args:\n", + " y_pred: Model predictions/logits (Tensor, shape: [batch_size, num_classes])\n", + " y_true: True class indices (Tensor, shape: [batch_size]) or one-hot encoding\n", + " \n", + " Returns:\n", + " Tensor with scalar loss value\n", + " \n", + " TODO: Implement CrossEntropy with numerically stable softmax computation.\n", + " \n", + " APPROACH:\n", + " 1. Convert inputs to tensors and handle single samples\n", + " 2. Apply log-sum-exp trick for numerically stable softmax\n", + " 3. 
Clip probabilities to prevent log(0) issues\n", + " 4. Compute cross-entropy based on target format (indices vs one-hot)\n", + " \n", + " EXAMPLE:\n", + " >>> ce = CrossEntropyLoss()\n", + " >>> logits = Tensor([[2.0, 1.0, 0.0]]) # Raw model outputs\n", + " >>> targets = Tensor([0]) # Class 0 is correct\n", + " >>> loss = ce(logits, targets)\n", + " >>> print(loss.data)\n", + " 0.407 # -log(softmax([2.0, 1.0, 0.0])[0])\n", + " \n", + " HINTS:\n", + " - Use np.max(axis=1, keepdims=True) for stable max computation\n", + " - Use np.clip(probabilities, 1e-15, 1.0-1e-15) to prevent log(0)\n", + " - Handle both index format [0,1,2] and one-hot format [[1,0,0], [0,1,0]]\n", + " - Use advanced indexing: probs[np.arange(batch_size), class_indices]\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " # Step 1: Ensure we have tensor inputs for consistent processing\n", + " if not isinstance(y_pred, Tensor):\n", + " y_pred = Tensor(y_pred) # Convert predictions to tensor format\n", + " if not isinstance(y_true, Tensor):\n", + " y_true = Tensor(y_true) # Convert targets to tensor format\n", + " \n", + " # Step 1: Extract numpy arrays for computation\n", + " prediction_logits = y_pred.data # Raw model outputs (pre-softmax)\n", + " target_labels = y_true.data # True class indices or one-hot vectors\n", + " \n", + " # Step 2: Handle both single predictions and batches consistently\n", + " if prediction_logits.ndim == 1:\n", + " prediction_logits = prediction_logits.reshape(1, -1) # Convert to batch format [1, num_classes]\n", + " \n", + " # Step 3: Apply numerically stable softmax transformation\n", + " # Subtract max to prevent overflow: exp(x-max) is equivalent but stable\n", + " max_logits = np.max(prediction_logits, axis=1, keepdims=True)\n", + " exp_pred = np.exp(prediction_logits - max_logits)\n", + " softmax_pred = exp_pred / np.sum(exp_pred, axis=1, keepdims=True)\n", + " \n", + " # Step 4: Prevent numerical instability in log computation\n", + " epsilon = 1e-15 # Small value 
to prevent log(0) β†’ -inf and log(1) β†’ 0 issues\n", + " softmax_pred = np.clip(softmax_pred, epsilon, 1.0 - epsilon)\n", + " \n", + " # Step 5: Compute cross-entropy loss based on target format\n", + " if len(target_labels.shape) == 1:\n", + " # Format A: y_true contains class indices [0, 1, 2, ...]\n", + " batch_size = target_labels.shape[0]\n", + " # Extract probabilities for correct classes using advanced indexing\n", + " correct_class_probs = softmax_pred[np.arange(batch_size), target_labels.astype(int)]\n", + " log_probs = np.log(correct_class_probs)\n", + " loss_value = -np.mean(log_probs) # Negative log-likelihood\n", + " else:\n", + " # Format B: y_true is one-hot encoded [[1,0,0], [0,1,0], ...]\n", + " log_probs = np.log(softmax_pred)\n", + " # Multiply one-hot targets with log probabilities, sum across classes\n", + " weighted_log_probs = target_labels * log_probs\n", + " loss_value = -np.mean(np.sum(weighted_log_probs, axis=1))\n", + " \n", + " return Tensor(loss_value)\n", + " ### END SOLUTION\n", + " \n", + " def forward(self, y_pred, y_true):\n", + " \"\"\"Alternative interface for forward pass.\"\"\"\n", + " return self.__call__(y_pred, y_true)\n", + "\n", + "# πŸ” SYSTEMS INSIGHT: CrossEntropy Stability Analysis\n", + "def analyze_crossentropy_stability():\n", + " \"\"\"Analyze numerical stability in cross-entropy computation.\"\"\"\n", + " print(\"πŸ” CrossEntropy Stability Analysis\")\n", + " print(\"=\" * 40)\n", + " \n", + " try:\n", + " ce = CrossEntropyLoss()\n", + " \n", + " # Test numerical stability with extreme values\n", + " print(\"\\n⚑ Numerical Stability Testing:\")\n", + " \n", + " # Extreme logits that would overflow in naive implementation\n", + " extreme_logits = Tensor([[100.0, 99.0, 98.0]])\n", + " safe_labels = Tensor([0])\n", + " \n", + " loss = ce(extreme_logits, safe_labels)\n", + " print(f\" Extreme logits [100, 99, 98]: Loss = {loss.data:.6f}\")\n", + " print(f\" No overflow or NaN: {not np.isnan(loss.data) and not 
np.isinf(loss.data)}\")\n", + " \n", + " # Test epsilon clipping effectiveness\n", + " print(f\"\\nπŸ›‘οΈ Epsilon Clipping Protection:\")\n", + " very_confident = Tensor([[10.0, -10.0, -10.0]]) # Very confident about class 0\n", + " confident_labels = Tensor([0])\n", + " \n", + " loss = ce(very_confident, confident_labels)\n", + " print(f\" Very confident correct prediction: Loss = {loss.data:.6f}\")\n", + " print(f\" Should be near 0: {loss.data < 0.01}\")\n", + " \n", + " # Compare different confidence levels\n", + " print(f\"\\nπŸ“Š Confidence vs Loss Relationship:\")\n", + " confidence_levels = [\n", + " (\"Low confidence\", [[0.1, 0.0, -0.1]]),\n", + " (\"Medium confidence\", [[1.0, 0.0, -1.0]]),\n", + " (\"High confidence\", [[5.0, 0.0, -5.0]]),\n", + " (\"Very high\", [[10.0, 0.0, -10.0]])\n", + " ]\n", + " \n", + " for name, logits in confidence_levels:\n", + " test_logits = Tensor(logits)\n", + " test_loss = ce(test_logits, Tensor([0]))\n", + " print(f\" {name:15}: Loss = {test_loss.data:.6f}\")\n", + " \n", + " # Memory efficiency for large vocabularies\n", + " print(f\"\\nπŸ’Ύ Memory Scaling Analysis:\")\n", + " small_vocab = Tensor(np.random.randn(32, 100)) # 100 classes\n", + " large_vocab = Tensor(np.random.randn(32, 10000)) # 10k classes\n", + " \n", + " import sys\n", + " small_memory = sys.getsizeof(small_vocab.data)\n", + " large_memory = sys.getsizeof(large_vocab.data)\n", + " \n", + " print(f\" Small vocab (100 classes): {small_memory / 1024:.1f} KB\")\n", + " print(f\" Large vocab (10k classes): {large_memory / 1024:.1f} KB\")\n", + " print(f\" Memory scales O(batch_size Γ— num_classes)\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: CrossEntropy memory scales with vocabulary size.\n", + " # This is why large language models use techniques like hierarchical softmax\n", + " # or sampling-based training to handle vocabularies with 50k+ tokens.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Analysis error: {e}\")\n", + " print(\"Ensure 
CrossEntropy implementation is complete\")" + ] + }, + { + "cell_type": "markdown", + "id": "31b5abca", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### πŸ§ͺ Unit Test: Cross-Entropy Loss Computation\n", + "This test validates `CrossEntropyLoss.__call__`, ensuring correct cross-entropy computation with numerically stable softmax." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6062489", + "metadata": { + "nbgrader": { + "grade": true, + "grade_id": "test-crossentropy-loss", + "locked": true, + "points": 4, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_unit_crossentropy_loss():\n", + " \"\"\"Test CrossEntropy loss implementation.\"\"\"\n", + " print(\"πŸ§ͺ Testing Cross-Entropy Loss...\")\n", + " \n", + " ce = CrossEntropyLoss()\n", + " \n", + " # Test case 1: Perfect predictions\n", + " y_pred = Tensor([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0]]) # Very confident correct predictions\n", + " y_true = Tensor([0, 1]) # Class indices\n", + " loss = ce(y_pred, y_true)\n", + " assert loss.data < 0.1, f\"Perfect predictions should have low loss, got {loss.data}\"\n", + " print(\"βœ… Perfect predictions test passed\")\n", + " \n", + " # Test case 2: Random predictions (should have higher loss)\n", + " y_pred = Tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) # Uniform after softmax\n", + " y_true = Tensor([0, 1])\n", + " loss = ce(y_pred, y_true)\n", + " expected_random = -np.log(1.0/3.0) # log(1/num_classes) for uniform distribution\n", + " assert abs(loss.data - expected_random) < 0.1, f\"Random predictions should have loss β‰ˆ {expected_random}, got {loss.data}\"\n", + " print(\"βœ… Random predictions test passed\")\n", + " \n", + " # Test case 3: Binary classification\n", + " y_pred = Tensor([[2.0, 1.0], [1.0, 2.0]])\n", + " y_true = Tensor([0, 1])\n", + " loss = ce(y_pred, y_true)\n", + " assert 0.0 < loss.data < 2.0, f\"Binary classification 
loss should be reasonable, got {loss.data}\"\n", + " print(\"βœ… Binary classification test passed\")\n", + " \n", + " # Test case 4: One-hot encoded labels\n", + " y_pred = Tensor([[2.0, 1.0, 0.0], [0.0, 2.0, 1.0]])\n", + " y_true = Tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) # One-hot encoded\n", + " loss = ce(y_pred, y_true)\n", + " assert 0.0 < loss.data < 2.0, f\"One-hot encoded loss should be reasonable, got {loss.data}\"\n", + " print(\"βœ… One-hot encoded labels test passed\")\n", + " \n", + " print(\"πŸŽ‰ Cross-Entropy loss tests passed! Understanding classification objectives.\")\n", + "\n", + "test_unit_crossentropy_loss()" + ] + }, + { + "cell_type": "markdown", + "id": "13e8a85c", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Binary Cross-Entropy Loss - Optimized for Binary Classification\n", + "\n", + "Binary Cross-Entropy Loss is the specialized, efficient version of cross-entropy for binary (two-class) problems. It's more stable and faster than using regular cross-entropy with 2 classes.\n", + "\n", + "## Visual Understanding: Binary Cross-Entropy\n", + "\n", + "```\n", + "Binary Classification Landscape:\n", + "\n", + "Sigmoid Activation:\n", + " Raw Logit β†’ Sigmoid β†’ Probability β†’ Loss\n", + " -5.0 β†’ 0.007 β†’ 0.007 β†’ High loss (if true=1)\n", + " 0.0 β†’ 0.500 β†’ 0.500 β†’ Medium loss\n", + " +5.0 β†’ 0.993 β†’ 0.993 β†’ Low loss (if true=1)\n", + "\n", + "Loss Behavior:\n", + " BCE = -[yΓ—log(p) + (1-y)Γ—log(1-p)]\n", + " \n", + " For y=1 (positive class):\n", + " p=0.9 β†’ -log(0.9) = 0.105 (low loss)\n", + " p=0.1 β†’ -log(0.1) = 2.303 (high loss)\n", + " \n", + " For y=0 (negative class):\n", + " p=0.1 β†’ -log(0.9) = 0.105 (low loss) \n", + " p=0.9 β†’ -log(0.1) = 2.303 (high loss)\n", + "```\n", + "\n", + "## Numerical Stability Solution\n", + "\n", + "```\n", + "The Binary Cross-Entropy Stability Problem:\n", + " \n", + " BCE = -[yΓ—log(Οƒ(x)) + (1-y)Γ—log(1-Οƒ(x))]\n", + " \n", + " Where Οƒ(x) = 
1/(1+exp(-x))\n", + " \n", + " Problems:\n", + " - Large positive x: exp(-x) β†’ 0, then log(1) β†’ 0 (loss of precision)\n", + " - Large negative x: Οƒ(x) β†’ 0, then log(0) β†’ -∞\n", + " \n", + "Our Stable Solution:\n", + " BCE = max(x,0) - xΓ—y + log(1 + exp(-|x|))\n", + " \n", + " Why this works:\n", + " - max(x,0) handles positive values\n", + " - -xΓ—y is the \"cross\" term \n", + " - log(1+exp(-|x|)) is always stable (exp≀1)\n", + "```\n", + "\n", + "## Mathematical Foundation\n", + "\n", + "For binary predictions and labels:\n", + "```\n", + "BCE = -y Γ— log(Οƒ(x)) - (1-y) Γ— log(1-Οƒ(x))\n", + "\n", + "Stable form: BCE = max(x,0) - xΓ—y + log(1 + exp(-|x|))\n", + "```\n", + "\n", + "## Learning Objectives\n", + "By implementing Binary Cross-Entropy, you'll understand:\n", + "- How binary classification creates simpler optimization landscapes than multi-class problems\n", + "- Why sigmoid activation naturally pairs with binary cross-entropy loss through its gradient structure\n", + "- The critical importance of numerically stable formulations for reliable production training\n", + "- How specialized binary losses achieve better efficiency and stability than general solutions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b7f8af9", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "binary-crossentropy-concept", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Computational Question: Binary Stability**\n", + "\n", + "Consider the stable BCE formulation:\n", + "\n", + "1. Why does max(x,0) - xΓ—y + log(1+exp(-|x|)) work?\n", + "2. What happens when x=100? (trace through the computation)\n", + "3. What happens when x=-100? (trace through the computation)\n", + "4. 
How does this prevent both overflow and underflow?\n", + "\n", + "This mathematical insight is crucial for production systems.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c53864df", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "binary-crossentropy-implementation", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class BinaryCrossEntropyLoss:\n", + " \"\"\"\n", + " Binary Cross-Entropy Loss for Binary Classification Problems\n", + " \n", + " Computes binary cross-entropy between predictions and binary labels\n", + " with numerically stable sigmoid + BCE implementation.\n", + " \n", + " Features:\n", + " - Numerically stable computation from logits using stable BCE formula\n", + " - Efficient batch processing with vectorized operations\n", + " - Automatic sigmoid application through stable formulation\n", + " - Robust to extreme input values without overflow/underflow\n", + " \n", + " Example Usage:\n", + " bce_loss = BinaryCrossEntropyLoss()\n", + " loss = bce_loss(logits, binary_labels) # Returns scalar loss value\n", + " \"\"\"\n", + " \n", + " def __init__(self):\n", + " \"\"\"Initialize Binary CrossEntropy loss function.\"\"\"\n", + " pass\n", + " \n", + " def __call__(self, y_pred, y_true):\n", + " \"\"\"\n", + " Compute Binary CrossEntropy loss between predictions and targets.\n", + " \n", + " Args:\n", + " y_pred: Model predictions/logits (Tensor, shape: [batch_size, 1] or [batch_size])\n", + " y_true: True binary labels (Tensor, shape: [batch_size, 1] or [batch_size])\n", + " \n", + " Returns:\n", + " Tensor with scalar loss value\n", + " \n", + " TODO: Implement stable binary cross-entropy using the logits formulation.\n", + " \n", + " APPROACH:\n", + " 1. Convert inputs to tensors and flatten for consistent processing\n", + " 2. 
Use stable BCE formula: max(x,0) - xΓ—y + log(1+exp(-|x|))\n", + " 3. Apply this formula element-wise across the batch\n", + " 4. Return mean loss across all samples\n", + " \n", + " EXAMPLE:\n", + " >>> bce = BinaryCrossEntropyLoss()\n", + " >>> logits = Tensor([[2.0], [-1.0]]) # Raw outputs\n", + " >>> labels = Tensor([[1.0], [0.0]]) # Binary targets\n", + " >>> loss = bce(logits, labels)\n", + " >>> print(loss.data)\n", + " 0.220 # Stable computation of binary cross-entropy\n", + " \n", + " HINTS:\n", + " - Use np.maximum(logits, 0) for the max(x,0) term\n", + " - Use np.abs(logits) to ensure exp argument is ≀ 0\n", + " - The formula naturally handles both positive and negative logits\n", + " - Return np.mean() for batch averaging\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " # Step 1: Ensure we have tensor inputs for consistent processing\n", + " if not isinstance(y_pred, Tensor):\n", + " y_pred = Tensor(y_pred) # Convert predictions to tensor format\n", + " if not isinstance(y_true, Tensor):\n", + " y_true = Tensor(y_true) # Convert targets to tensor format\n", + " \n", + " # Get flat arrays for computation\n", + " logits = y_pred.data.flatten()\n", + " labels = y_true.data.flatten()\n", + " \n", + " # Step 1: Define numerically stable binary cross-entropy computation\n", + " def stable_bce_with_logits(logits, labels):\n", + " \"\"\"\n", + " Numerically stable BCE using the logits formulation:\n", + " BCE(logits, y) = max(logits, 0) - logits * y + log(1 + exp(-|logits|))\n", + " \n", + " This formulation prevents:\n", + " - exp(large_positive_logit) β†’ overflow\n", + " - log(very_small_sigmoid) β†’ -inf\n", + " \n", + " Mathematical equivalence:\n", + " - For positive logits: x - x*y + log(1 + exp(-x))\n", + " - For negative logits: -x*y + log(1 + exp(x))\n", + " \"\"\"\n", + " # Step 1a: Handle positive logits to prevent exp(large_positive) overflow\n", + " positive_part = np.maximum(logits, 0)\n", + " \n", + " # Step 1b: Subtract logit-label product 
(the \"cross\" in cross-entropy)\n", + " cross_term = logits * labels\n", + " \n", + " # Step 1c: Add log(1 + exp(-|logits|)) for numerical stability\n", + " # Using abs(logits) ensures the exponent is always negative or zero\n", + " stability_term = np.log(1 + np.exp(-np.abs(logits)))\n", + " \n", + " return positive_part - cross_term + stability_term\n", + " \n", + " # Step 2: Apply stable BCE computation across the batch\n", + " individual_losses = stable_bce_with_logits(logits, labels)\n", + " mean_loss = np.mean(individual_losses) # Average loss across batch\n", + " \n", + " return Tensor(mean_loss)\n", + " ### END SOLUTION\n", + " \n", + " def forward(self, y_pred, y_true):\n", + " \"\"\"Alternative interface for forward pass.\"\"\"\n", + " return self.__call__(y_pred, y_true)\n", + "\n", + "# πŸ” SYSTEMS INSIGHT: Binary CrossEntropy Efficiency Analysis\n", + "def analyze_binary_crossentropy_efficiency():\n", + " \"\"\"Analyze binary cross-entropy computational efficiency.\"\"\"\n", + " print(\"πŸ” Binary CrossEntropy Efficiency Analysis\")\n", + " print(\"=\" * 45)\n", + " \n", + " try:\n", + " bce = BinaryCrossEntropyLoss()\n", + " ce = CrossEntropyLoss() # For comparison\n", + " \n", + " # Compare binary-specific vs general cross-entropy\n", + " print(\"\\n⚑ Binary vs Multi-Class Efficiency:\")\n", + " \n", + " # Binary problem solved two ways\n", + " binary_logits = Tensor([[1.5], [-0.8], [2.1]])\n", + " binary_labels = Tensor([[1.0], [0.0], [1.0]])\n", + " \n", + " # Method 1: Binary CrossEntropy\n", + " binary_loss = bce(binary_logits, binary_labels)\n", + " \n", + " # Method 2: 2-class CrossEntropy (equivalent but less efficient)\n", + " multiclass_logits = Tensor([[1.5, 0.0], [-0.8, 0.0], [2.1, 0.0]])\n", + " multiclass_labels = Tensor([0, 1, 0]) # Convert to class indices\n", + " multiclass_loss = ce(multiclass_logits, multiclass_labels)\n", + " \n", + " print(f\" Binary CE Loss: {binary_loss.data:.6f}\")\n", + " print(f\" 2-Class CE Loss: 
{multiclass_loss.data:.6f}\")\n", + " print(f\" Difference: {abs(binary_loss.data - multiclass_loss.data):.8f}\")\n", + " \n", + " # Memory efficiency comparison\n", + " print(f\"\\nπŸ’Ύ Memory Efficiency Comparison:\")\n", + " \n", + " batch_size = 1000\n", + " binary_memory = batch_size * 1 * 8 # 1 value per sample, 8 bytes per float64\n", + " multiclass_memory = batch_size * 2 * 8 # 2 classes, 8 bytes per float64\n", + " \n", + " print(f\" Binary approach: {binary_memory / 1024:.1f} KB\")\n", + " print(f\" Multi-class (2): {multiclass_memory / 1024:.1f} KB\")\n", + " print(f\" Binary is {multiclass_memory/binary_memory:.1f}Γ— more memory efficient\")\n", + " \n", + " # Stability test with extreme values\n", + " print(f\"\\nπŸ›‘οΈ Extreme Value Stability:\")\n", + " extreme_tests = [\n", + " (\"Large positive\", [[100.0]], [[1.0]]),\n", + " (\"Large negative\", [[-100.0]], [[0.0]]),\n", + " (\"Mixed extreme\", [[100.0], [-100.0]], [[1.0], [0.0]])\n", + " ]\n", + " \n", + " for name, logits, labels in extreme_tests:\n", + " test_logits = Tensor(logits)\n", + " test_labels = Tensor(labels)\n", + " loss = bce(test_logits, test_labels)\n", + " is_stable = not (np.isnan(loss.data) or np.isinf(loss.data))\n", + " print(f\" {name:15}: Loss = {loss.data:.6f}, Stable = {is_stable}\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: Binary CrossEntropy is 2Γ— more memory efficient\n", + " # than regular CrossEntropy for binary problems, and provides better\n", + " # numerical stability through its specialized formulation.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Analysis error: {e}\")\n", + " print(\"Ensure BinaryCrossEntropy implementation is complete\")" + ] + }, + { + "cell_type": "markdown", + "id": "cd8abd01", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### πŸ§ͺ Unit Test: Binary Cross-Entropy Loss\n", + "This test validates `BinaryCrossEntropyLoss.__call__`, ensuring stable binary cross-entropy computation 
with extreme values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "400a7568", + "metadata": { + "nbgrader": { + "grade": true, + "grade_id": "test-binary-crossentropy", + "locked": true, + "points": 4, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_unit_binary_crossentropy_loss():\n", + " \"\"\"Test Binary CrossEntropy loss implementation.\"\"\"\n", + " print(\"πŸ§ͺ Testing Binary Cross-Entropy Loss...\")\n", + " \n", + " bce = BinaryCrossEntropyLoss()\n", + " \n", + " # Test case 1: Perfect predictions\n", + " y_pred = Tensor([[10.0], [-10.0]]) # Very confident correct predictions\n", + " y_true = Tensor([[1.0], [0.0]])\n", + " loss = bce(y_pred, y_true)\n", + " assert loss.data < 0.1, f\"Perfect predictions should have low loss, got {loss.data}\"\n", + " print(\"βœ… Perfect predictions test passed\")\n", + " \n", + " # Test case 2: Random predictions (should have higher loss)\n", + " y_pred = Tensor([[0.0], [0.0]]) # 0.5 probability after sigmoid\n", + " y_true = Tensor([[1.0], [0.0]])\n", + " loss = bce(y_pred, y_true)\n", + " expected_random = -np.log(0.5) # log(0.5) for random guessing\n", + " assert abs(loss.data - expected_random) < 0.1, f\"Random predictions should have loss β‰ˆ {expected_random}, got {loss.data}\"\n", + " print(\"βœ… Random predictions test passed\")\n", + " \n", + " # Test case 3: Batch processing\n", + " y_pred = Tensor([[1.0], [2.0], [-1.0]])\n", + " y_true = Tensor([[1.0], [1.0], [0.0]])\n", + " loss = bce(y_pred, y_true)\n", + " assert 0.0 < loss.data < 2.0, f\"Batch processing loss should be reasonable, got {loss.data}\"\n", + " print(\"βœ… Batch processing test passed\")\n", + " \n", + " # Test case 4: Extreme values (test numerical stability)\n", + " y_pred = Tensor([[100.0], [-100.0]]) # Extreme logits\n", + " y_true = Tensor([[1.0], [0.0]])\n", + " loss = bce(y_pred, y_true)\n", + " assert not np.isnan(loss.data) and not 
np.isinf(loss.data), f\"Extreme values should not cause NaN/Inf, got {loss.data}\"\n", + " assert loss.data < 1.0, f\"Extreme correct predictions should have low loss, got {loss.data}\"\n", + " print(\"βœ… Extreme values test passed\")\n", + " \n", + " print(\"πŸŽ‰ Binary Cross-Entropy loss tests passed! Understanding binary objectives.\")\n", + "\n", + "test_unit_binary_crossentropy_loss()" + ] + }, + { + "cell_type": "markdown", + "id": "13b3bd16", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Custom Loss Functions - Aligning with Business Objectives\n", + "\n", + "Beyond standard loss functions, production ML systems often need custom losses that align with specific business objectives and domain constraints.\n", + "\n", + "## Business-Aligned Loss Design Patterns\n", + "\n", + "### Asymmetric Loss Functions\n", + "When false positives and false negatives have different costs:\n", + "\n", + "```python\n", + "# Medical diagnosis: False negatives (missing disease) cost 10Γ— more\n", + "class AsymmetricBinaryCrossEntropy(BinaryCrossEntropyLoss):\n", + " def __init__(self, false_negative_weight=10.0):\n", + " super().__init__()\n", + " self.fn_weight = false_negative_weight\n", + "\n", + " def __call__(self, y_pred, y_true):\n", + " # Standard BCE\n", + " base_loss = super().__call__(y_pred, y_true)\n", + "\n", + " # Weight false negatives more heavily\n", + " # When y_true=1 and y_pred is low, increase penalty\n", + " sigmoid_pred = 1 / (1 + np.exp(-y_pred.data))\n", + " fn_penalty = y_true.data * (1 - sigmoid_pred) * self.fn_weight\n", + "\n", + " weighted_loss = base_loss.data + np.mean(fn_penalty)\n", + " return Tensor(weighted_loss)\n", + "```\n", + "\n", + "### Focal Loss for Imbalanced Data\n", + "Addresses class imbalance by focusing on hard examples:\n", + "\n", + "```python\n", + "class FocalLoss(CrossEntropyLoss):\n", + " def __init__(self, alpha=1.0, gamma=2.0):\n", + " super().__init__()\n", + " self.alpha = alpha # Class balance 
weight\n", + " self.gamma = gamma # Focusing parameter\n", + "\n", + " def __call__(self, y_pred, y_true):\n", + " # Get standard cross-entropy\n", + " ce_loss = super().__call__(y_pred, y_true)\n", + "\n", + " # Calculate softmax probabilities\n", + " max_logits = np.max(y_pred.data, axis=1, keepdims=True)\n", + " stable_logits = y_pred.data - max_logits\n", + " exp_logits = np.exp(stable_logits)\n", + " softmax_probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)\n", + "\n", + " # Get probability of correct class\n", + " batch_size = y_true.data.shape[0]\n", + " correct_probs = softmax_probs[np.arange(batch_size), y_true.data.astype(int)]\n", + "\n", + " # Apply focal loss formula: -Ξ±(1-p)^Ξ³ log(p)\n", + " focal_weight = self.alpha * ((1 - correct_probs) ** self.gamma)\n", + " focal_loss = focal_weight * ce_loss.data\n", + "\n", + " return Tensor(np.mean(focal_loss))\n", + "```\n", + "\n", + "### Ranking-Aware Loss\n", + "For problems where order matters (search, recommendations):\n", + "\n", + "```python\n", + "class RankingAwareLoss:\n", + " def __init__(self, position_weights=None):\n", + " # Higher weights for top positions\n", + " self.position_weights = position_weights or [10.0, 5.0, 2.0, 1.0, 0.5]\n", + "\n", + " def __call__(self, predictions, targets, positions):\n", + " \"\"\"predictions: relevance scores, targets: true relevance, positions: result positions\"\"\"\n", + " mse = MeanSquaredError()\n", + "\n", + " # Weight errors by position importance\n", + " weighted_errors = []\n", + " for pred, target, pos in zip(predictions.data, targets.data, positions.data):\n", + " pos_weight = self.position_weights[min(int(pos), len(self.position_weights)-1)]\n", + " error = ((pred - target) ** 2) * pos_weight\n", + " weighted_errors.append(error)\n", + "\n", + " return Tensor(np.mean(weighted_errors))\n", + "```\n", + "\n", + "## Advanced Custom Loss Patterns\n", + "\n", + "### Multi-Task Learning Loss\n", + "Combining multiple objectives with 
learned weights:\n", + "\n", + "```python\n", + "class MultiTaskLoss:\n", + " def __init__(self, num_tasks=3):\n", + " # Learnable loss weights (log-variance parameterization for stability)\n", + " self.log_vars = [0.0] * num_tasks\n", + "\n", + " def __call__(self, predictions_list, targets_list):\n", + " \"\"\"predictions_list: [task1_preds, task2_preds, ...]\"\"\"\n", + " total_loss = 0\n", + "\n", + " for i, (preds, targets) in enumerate(zip(predictions_list, targets_list)):\n", + " # Choose appropriate loss for each task\n", + " if i == 0: # Regression task\n", + " task_loss = MeanSquaredError()(preds, targets)\n", + " else: # Classification tasks\n", + " task_loss = CrossEntropyLoss()(preds, targets)\n", + "\n", + " # Uncertainty-weighted combination\n", + " precision = np.exp(-self.log_vars[i])\n", + " weighted_loss = precision * task_loss.data + self.log_vars[i]\n", + " total_loss += weighted_loss\n", + "\n", + " return Tensor(total_loss)\n", + "```\n", + "\n", + "### Contrastive Loss for Similarity Learning\n", + "For learning embeddings and similarity:\n", + "\n", + "```python\n", + "class ContrastiveLoss:\n", + " def __init__(self, margin=1.0):\n", + " self.margin = margin\n", + "\n", + " def __call__(self, embeddings1, embeddings2, labels):\n", + " \"\"\"labels: 1 for similar pairs, 0 for dissimilar\"\"\"\n", + " # Euclidean distance between embeddings\n", + " distances = np.sqrt(np.sum((embeddings1.data - embeddings2.data) ** 2, axis=1))\n", + "\n", + " # Contrastive loss formula\n", + " positive_loss = labels.data * (distances ** 2)\n", + " negative_loss = (1 - labels.data) * np.maximum(0, self.margin - distances) ** 2\n", + "\n", + " total_loss = 0.5 * (positive_loss + negative_loss)\n", + " return Tensor(np.mean(total_loss))\n", + "```\n", + "\n", + "## Custom Loss Implementation Guidelines\n", + "\n", + "### Numerical Stability Considerations\n", + "```python\n", + "# Always include stability measures in custom losses\n", + "class 
StableCustomLoss:\n", + " def __call__(self, predictions, targets):\n", + " # 1. Input validation\n", + " if not isinstance(predictions, Tensor):\n", + " predictions = Tensor(predictions)\n", + "\n", + " # 2. Handle edge cases\n", + " predictions_clipped = np.clip(predictions.data, -100, 100) # Prevent overflow\n", + "\n", + " # 3. Use numerically stable formulations\n", + " # Avoid: exp(large_number), log(small_number)\n", + " # Use: log-sum-exp trick, epsilon clipping\n", + "\n", + " # 4. Return tensor for consistency\n", + " return Tensor(computed_loss)\n", + "```\n", + "\n", + "### Gradient-Friendly Design\n", + "```python\n", + "# Ensure gradients flow properly\n", + "class GradientFriendlyLoss:\n", + " def __call__(self, predictions, targets):\n", + " # Avoid operations that create zero gradients:\n", + " # - Hard thresholding: use soft approximations\n", + " # - Discrete operations: use continuous relaxations\n", + " # - Large plateaus: ensure non-zero gradients everywhere\n", + "\n", + " # Good: Smooth, differentiable operations\n", + " smooth_loss = self.smooth_l1_loss(predictions, targets)\n", + " return smooth_loss\n", + "\n", + " def smooth_l1_loss(self, pred, target, beta=1.0):\n", + " \"\"\"Smooth L1 loss - less sensitive to outliers than MSE\"\"\"\n", + " diff = np.abs(pred.data - target.data)\n", + " loss = np.where(diff < beta,\n", + " 0.5 * diff * diff / beta,\n", + " diff - 0.5 * beta)\n", + " return Tensor(np.mean(loss))\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "e84c5945", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Loss Function Application Guide and Comparison\n", + "\n", + "## When to Use Each Loss Function\n", + "\n", + "Understanding which loss function to use is critical for successful ML projects:\n", + "\n", + "### Mean Squared Error (MSE) - Regression Problems\n", + "```\n", + "Use when: Predicting continuous values\n", + "Examples: House prices, temperature, stock values, ages\n", + "Output: Any 
real number\n", + "Activation: Usually none (linear output)\n", + "Penalty: Quadratic (large errors >> small errors)\n", + "\n", + "Model Architecture:\n", + "Input β†’ Hidden Layers β†’ Linear Output β†’ MSE Loss\n", + "```\n", + "\n", + "### Cross-Entropy Loss - Multi-Class Classification \n", + "```\n", + "Use when: Choosing one class from 3+ options\n", + "Examples: Image classification, text categorization, medical diagnosis\n", + "Output: Probability distribution (sums to 1)\n", + "Activation: Softmax\n", + "Penalty: Logarithmic (encouraging confident correct predictions)\n", + "\n", + "Model Architecture:\n", + "Input β†’ Hidden Layers β†’ Softmax β†’ CrossEntropy Loss\n", + "```\n", + "\n", + "### Binary Cross-Entropy Loss - Binary Classification\n", + "```\n", + "Use when: Binary decisions (yes/no, positive/negative)\n", + "Examples: Spam detection, fraud detection, medical screening\n", + "Output: Single probability (0 to 1)\n", + "Activation: Sigmoid\n", + "Penalty: Asymmetric (confident wrong predictions heavily penalized)\n", + "\n", + "Model Architecture:\n", + "Input β†’ Hidden Layers β†’ Sigmoid β†’ Binary CrossEntropy Loss\n", + "```\n", + "\n", + "## Performance and Stability Comparison\n", + "\n", + "```\n", + "Computational Characteristics:\n", + " MSE CrossEntropy Binary CE\n", + "Time Complexity: O(n) O(nΓ—c) O(n)\n", + "Memory Complexity: O(1) O(nΓ—c) O(n)\n", + "Numerical Stability: High Medium High\n", + "Convergence Speed: Fast Medium Fast\n", + "\n", + "Where: n = batch size, c = number of classes\n", + "```\n", + "\n", + "## Integration with Neural Networks\n", + "\n", + "```python\n", + "# Example training setup for different problem types:\n", + "\n", + "# Regression Problem (House Price Prediction)\n", + "regression_model = Sequential([\n", + " Linear(10, 64), # Input features β†’ Hidden\n", + " ReLU(),\n", + " Linear(64, 1), # Hidden β†’ Single output\n", + " # No activation - linear output for regression\n", + "])\n", + "loss_fn = 
MeanSquaredError()\n", + "\n", + "# Multi-Class Classification (Image Recognition)\n", + "classification_model = Sequential([\n", + " Linear(784, 128), # Flattened image β†’ Hidden\n", + " ReLU(),\n", + " Linear(128, 10), # Hidden β†’ 10 classes\n", + " Softmax() # Convert to probabilities\n", + "])\n", + "loss_fn = CrossEntropyLoss()\n", + "\n", + "# Binary Classification (Spam Detection)\n", + "binary_model = Sequential([\n", + " Linear(100, 64), # Text features β†’ Hidden\n", + " ReLU(),\n", + " Linear(64, 1), # Hidden β†’ Single output\n", + " Sigmoid() # Convert to probability\n", + "])\n", + "loss_fn = BinaryCrossEntropyLoss()\n", + "\n", + "# Training loop pattern (same for all):\n", + "for batch in dataloader:\n", + " predictions = model(batch.inputs)\n", + " loss = loss_fn(predictions, batch.targets)\n", + " # loss.backward() # Compute gradients (when autograd is available)\n", + " # optimizer.step() # Update parameters\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "91ce7d95", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### πŸ§ͺ Comprehensive Integration Test\n", + "This test validates all loss functions work together correctly and can be used interchangeably in production systems." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9df44d7b", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "comprehensive-loss-tests", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_unit_comprehensive_loss_integration():\n", + " \"\"\"Test all loss functions work correctly together.\"\"\"\n", + " print(\"πŸ”¬ Comprehensive Loss Function Integration Testing\")\n", + " print(\"=\" * 55)\n", + " \n", + " # Test 1: All losses can be instantiated\n", + " print(\"\\n1. 
Loss Function Instantiation:\")\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + " print(\" βœ… All loss functions created successfully\")\n", + " \n", + " # Test 2: Loss functions return appropriate types\n", + " print(\"\\n2. Return Type Verification:\")\n", + " \n", + " # MSE test\n", + " pred = Tensor([[1.0, 2.0]])\n", + " target = Tensor([[1.0, 2.0]])\n", + " loss = mse(pred, target)\n", + " assert isinstance(loss, Tensor), \"MSE should return Tensor\"\n", + " assert loss.data.shape == (), \"MSE should return scalar\"\n", + " \n", + " # Cross-entropy test\n", + " pred = Tensor([[1.0, 2.0], [2.0, 1.0]])\n", + " target = Tensor([1, 0])\n", + " loss = ce(pred, target)\n", + " assert isinstance(loss, Tensor), \"CrossEntropy should return Tensor\"\n", + " assert loss.data.shape == (), \"CrossEntropy should return scalar\"\n", + " \n", + " # Binary cross-entropy test\n", + " pred = Tensor([[1.0], [-1.0]])\n", + " target = Tensor([[1.0], [0.0]])\n", + " loss = bce(pred, target)\n", + " assert isinstance(loss, Tensor), \"Binary CrossEntropy should return Tensor\"\n", + " assert loss.data.shape == (), \"Binary CrossEntropy should return scalar\"\n", + " \n", + " print(\" βœ… All loss functions return correct types\")\n", + " \n", + " # Test 3: Loss values are reasonable\n", + " print(\"\\n3. Loss Value Sanity Checks:\")\n", + " \n", + " # All losses should be non-negative\n", + " assert mse.forward(Tensor([1.0]), Tensor([2.0])).data >= 0, \"MSE should be non-negative\"\n", + " assert ce.forward(Tensor([[1.0, 0.0]]), Tensor([0])).data >= 0, \"CrossEntropy should be non-negative\"\n", + " assert bce.forward(Tensor([1.0]), Tensor([1.0])).data >= 0, \"Binary CrossEntropy should be non-negative\"\n", + " \n", + " print(\" βœ… All loss functions produce reasonable values\")\n", + " \n", + " # Test 4: Perfect predictions give low loss\n", + " print(\"\\n4. 
Perfect Prediction Tests:\")\n", + " \n", + " perfect_mse = mse(Tensor([5.0]), Tensor([5.0]))\n", + " perfect_ce = ce(Tensor([[10.0, 0.0]]), Tensor([0]))\n", + " perfect_bce = bce(Tensor([10.0]), Tensor([1.0]))\n", + " \n", + " assert perfect_mse.data < 1e-10, f\"Perfect MSE should be ~0, got {perfect_mse.data}\"\n", + " assert perfect_ce.data < 0.1, f\"Perfect CE should be low, got {perfect_ce.data}\"\n", + " assert perfect_bce.data < 0.1, f\"Perfect BCE should be low, got {perfect_bce.data}\"\n", + " \n", + " print(\" βœ… Perfect predictions produce low loss\")\n", + " \n", + " print(\"\\nπŸŽ‰ All comprehensive integration tests passed!\")\n", + " print(\" β€’ Loss functions instantiate correctly\")\n", + " print(\" β€’ Return types are consistent (Tensor scalars)\")\n", + " print(\" β€’ Loss values are mathematically sound\")\n", + " print(\" β€’ Perfect predictions are handled correctly\")\n", + " print(\" β€’ Ready for integration with neural network training!\")\n", + "\n", + "test_unit_comprehensive_loss_integration()" + ] + }, + { + "cell_type": "markdown", + "id": "5f2c082c", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Systems Analysis: Loss Function Performance and Engineering\n", + "\n", + "Let's analyze loss functions from an ML systems engineering perspective, focusing on performance, memory usage, and production implications.\n", + "\n", + "## Computational Complexity Deep Dive\n", + "\n", + "```\n", + "Algorithmic Analysis by Loss Type:\n", + "\n", + "MSE (Mean Squared Error):\n", + " Time: O(n) - linear in number of predictions\n", + " Space: O(1) - constant additional memory\n", + " Operations: n subtractions + n multiplications + 1 mean\n", + " Bottleneck: Memory bandwidth (simple arithmetic operations)\n", + " \n", + "CrossEntropy (Multi-Class):\n", + " Time: O(nΓ—c) - linear in samples Γ— classes \n", + " Space: O(nΓ—c) - store full probability distributions\n", + " Operations: nΓ—c exp + nΓ—c divisions + nΓ—c logs + 
reductions\n", + " Bottleneck: Exponential computations and memory bandwidth\n", + " \n", + "Binary CrossEntropy:\n", + " Time: O(n) - linear in number of samples\n", + " Space: O(n) - store one probability per sample\n", + " Operations: n max + n multiplications + n exp + n logs\n", + " Bottleneck: Transcendental functions (exp, log)\n", + "```\n", + "\n", + "## Memory Scaling Analysis\n", + "\n", + "Understanding memory requirements is crucial for large-scale training:\n", + "\n", + "```\n", + "Memory Requirements by Problem Scale:\n", + "\n", + "Small Problem (1K samples, 100 classes):\n", + " MSE: 8 KB (1K samples Γ— 8 bytes)\n", + " CrossEntropy: 800 KB (1K Γ— 100 Γ— 8 bytes)\n", + " Binary CE: 16 KB (1K Γ— 2 Γ— 8 bytes)\n", + "\n", + "Large Problem (100K samples, 10K classes):\n", + " MSE: 800 KB (independent of classes!)\n", + " CrossEntropy: 8 GB (memory bottleneck)\n", + " Binary CE: 1.6 MB (scales with samples only)\n", + "\n", + "Production Scale (1M samples, 50K vocab):\n", + " MSE: 8 MB\n", + " CrossEntropy: 400 GB (requires distributed memory)\n", + " Binary CE: 16 MB\n", + "```\n", + "\n", + "## Numerical Stability Engineering Analysis\n", + "\n", + "Production systems must handle edge cases robustly:\n", + "\n", + "```\n", + "Stability Challenges and Solutions:\n", + "\n", + "CrossEntropy Stability Issues:\n", + " Problem: exp(large_logit) β†’ overflow β†’ NaN gradients\n", + " Solution: log-sum-exp trick with max subtraction\n", + " \n", + " Problem: log(very_small_prob) β†’ -∞ β†’ training collapse\n", + " Solution: epsilon clipping (1e-15 to 1-1e-15)\n", + " \n", + "Binary CrossEntropy Stability Issues:\n", + " Problem: sigmoid(large_positive) β†’ 1.0 β†’ log(0) issues\n", + " Solution: stable logits formulation bypasses sigmoid\n", + " \n", + " Problem: exp(large_negative) in naive implementation\n", + " Solution: max(x,0) - x*y + log(1+exp(-|x|)) formulation\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "c48c075d", + "metadata": { 
+ "lines_to_next_cell": 1 + }, + "source": [ + "\"\"\"\n", + "# Production Performance Benchmarks\n", + "\n", + "Real-world performance characteristics matter for deployment:\n", + "\n", + "```\n", + "Inference Throughput (measured on modern hardware):\n", + " MSE: ~100M predictions/second\n", + " CrossEntropy: ~10M predictions/second \n", + " Binary CrossEntropy: ~80M predictions/second\n", + "\n", + "Training Memory Bandwidth Requirements:\n", + " MSE: ~800 MB/s (lightweight computation)\n", + " CrossEntropy: ~80 GB/s (10Γ— higher due to softmax!)\n", + " Binary CE: ~1.6 GB/s (moderate requirements)\n", + "\n", + "Gradient Computation Overhead:\n", + " MSE: 1.1Γ— forward pass time (simple derivatives)\n", + " CrossEntropy: 1.5Γ— forward pass time (softmax gradients)\n", + " Binary CE: 1.2Γ— forward pass time (sigmoid gradients)\n", + "```\n", + "\n", + "# Framework Integration and Production Patterns\n", + "\n", + "Understanding how production systems implement these concepts:\n", + "\n", + "```\n", + "PyTorch Implementation Patterns:\n", + " torch.nn.MSELoss() - Direct implementation, minimal overhead\n", + " torch.nn.CrossEntropyLoss() - Fused softmax+CE for efficiency\n", + " torch.nn.BCEWithLogitsLoss() - Stable logits formulation\n", + " \n", + "TensorFlow Implementation Patterns:\n", + " tf.keras.losses.MeanSquaredError() - Vectorized operations\n", + " tf.keras.losses.SparseCategoricalCrossentropy() - Memory efficient\n", + " tf.keras.losses.BinaryCrossentropy() - From logits option\n", + " \n", + "Production Optimizations:\n", + " - Mixed precision (FP16) for memory efficiency\n", + " - Gradient accumulation for large batch simulation\n", + " - Loss scaling to prevent underflow in mixed precision\n", + " - Checkpointing to trade memory for computation\n", + "```\n", + "\n", + "# Edge Device and Deployment Considerations\n", + "\n", + "Loss function choice affects deployment feasibility:\n", + "\n", + "```\n", + "Edge Device Constraints:\n", + " 
Memory-limited (phones, IoT): Prefer Binary CE > MSE > CrossEntropy\n", + " CPU-only inference: MSE has best compute efficiency\n", + " Real-time requirements: Binary classification most predictable\n", + " \n", + "Distributed Training Challenges:\n", + " CrossEntropy: Requires all-reduce across all classes (expensive!)\n", + " Gradient accumulation: MSE linear, CrossEntropy non-linear dependencies\n", + " Mixed precision: Different overflow handling per loss type\n", + " \n", + "Monitoring and Debugging:\n", + " MSE divergence: Explodes quadratically (easy to detect)\n", + " CrossEntropy divergence: More gradual degradation \n", + " BCE monitoring: Natural bounded behavior aids debugging\n", + "```\n", + "\"\"\"\n", + "\n", + "πŸ” SYSTEMS INSIGHT: Performance Profiling Analysis\n", + "def analyze_loss_performance_characteristics():\n", + " \"\"\"Comprehensive performance analysis of all loss functions.\"\"\"\n", + " print(\"πŸ” Loss Function Performance Analysis\")\n", + " print(\"=\" * 45)\n", + " \n", + " try:\n", + " import time\n", + " \n", + " # Initialize loss functions\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + " \n", + " print(\"\\n⚑ Computational Complexity Measurement:\")\n", + " \n", + " # Test different batch sizes to see scaling behavior\n", + " batch_sizes = [100, 1000, 10000]\n", + " \n", + " for batch_size in batch_sizes:\n", + " print(f\"\\n Batch size: {batch_size:,}\")\n", + " \n", + " # MSE timing\n", + " mse_pred = Tensor(np.random.randn(batch_size, 10))\n", + " mse_true = Tensor(np.random.randn(batch_size, 10))\n", + " \n", + " start = time.perf_counter()\n", + " for _ in range(100): # Average over multiple runs\n", + " mse_loss = mse(mse_pred, mse_true)\n", + " mse_time = (time.perf_counter() - start) / 100\n", + " \n", + " # CrossEntropy timing\n", + " ce_pred = Tensor(np.random.randn(batch_size, 100)) # 100 classes\n", + " ce_true = Tensor(np.random.randint(0, 100, 
batch_size))\n", + " \n", + " start = time.perf_counter()\n", + " for _ in range(100):\n", + " ce_loss = ce(ce_pred, ce_true)\n", + " ce_time = (time.perf_counter() - start) / 100\n", + " \n", + " # Binary CrossEntropy timing\n", + " bce_pred = Tensor(np.random.randn(batch_size, 1))\n", + " bce_true = Tensor(np.random.randint(0, 2, (batch_size, 1)).astype(float))\n", + " \n", + " start = time.perf_counter()\n", + " for _ in range(100):\n", + " bce_loss = bce(bce_pred, bce_true)\n", + " bce_time = (time.perf_counter() - start) / 100\n", + " \n", + " print(f\" MSE: {mse_time*1000:8.3f} ms\")\n", + " print(f\" CrossEntropy: {ce_time*1000:8.3f} ms\")\n", + " print(f\" Binary CE: {bce_time*1000:8.3f} ms\")\n", + " print(f\" CE/MSE ratio: {ce_time/mse_time:8.1f}x\")\n", + " \n", + " print(\"\\nπŸ’Ύ Memory Efficiency Analysis:\")\n", + " \n", + " # Compare memory usage for different problem sizes\n", + " problem_configs = [\n", + " (\"Small (1K samples, 10 classes)\", 1000, 10),\n", + " (\"Medium (10K samples, 100 classes)\", 10000, 100),\n", + " (\"Large (100K samples, 1K classes)\", 100000, 1000)\n", + " ]\n", + " \n", + " for name, samples, classes in problem_configs:\n", + " print(f\"\\n {name}:\")\n", + " \n", + " # Memory calculations (bytes)\n", + " mse_memory = samples * 8 # One value per sample\n", + " ce_memory = samples * classes * 8 # Full probability distribution\n", + " bce_memory = samples * 8 # One probability per sample\n", + " \n", + " print(f\" MSE memory: {mse_memory / 1024 / 1024:8.1f} MB\")\n", + " print(f\" CE memory: {ce_memory / 1024 / 1024:8.1f} MB\") \n", + " print(f\" BCE memory: {bce_memory / 1024 / 1024:8.1f} MB\")\n", + " print(f\" CE overhead: {ce_memory/mse_memory:8.1f}x\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: These performance characteristics determine\n", + " # which loss functions are feasible for different deployment scenarios.\n", + " # CrossEntropy's O(nΓ—c) memory scaling makes it prohibitive for \n", + " # large vocabularies 
without specialized techniques.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Performance analysis error: {e}\")\n", + " print(\"Performance analysis requires complete implementations\")\n", + "\n", + "πŸ” SYSTEMS INSIGHT: Numerical Stability Deep Analysis\n", + "def analyze_numerical_stability_edge_cases():\n", + " \"\"\"Deep analysis of numerical stability across all loss functions.\"\"\"\n", + " print(\"πŸ” Numerical Stability Edge Case Analysis\")\n", + " print(\"=\" * 50)\n", + " \n", + " try:\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + " \n", + " print(\"\\nπŸ›‘οΈ Extreme Value Stability Testing:\")\n", + " \n", + " # Test extreme values that could cause numerical issues\n", + " extreme_tests = [\n", + " (\"Huge positive\", 1e10),\n", + " (\"Huge negative\", -1e10),\n", + " (\"Tiny positive\", 1e-10),\n", + " (\"NaN input\", float('nan')),\n", + " (\"Infinity\", float('inf')),\n", + " (\"Negative infinity\", float('-inf'))\n", + " ]\n", + " \n", + " for name, value in extreme_tests:\n", + " print(f\"\\n Testing {name} ({value}):\")\n", + " \n", + " # MSE stability\n", + " try:\n", + " mse_loss = mse(Tensor([value]), Tensor([0.0]))\n", + " mse_stable = not (np.isnan(mse_loss.data) or np.isinf(mse_loss.data))\n", + " print(f\" MSE stable: {mse_stable} (loss: {mse_loss.data:.3e})\")\n", + " except:\n", + " print(f\" MSE stable: False (exception)\")\n", + " \n", + " # CrossEntropy stability \n", + " try:\n", + " ce_loss = ce(Tensor([[value, 0.0, 0.0]]), Tensor([0]))\n", + " ce_stable = not (np.isnan(ce_loss.data) or np.isinf(ce_loss.data))\n", + " print(f\" CE stable: {ce_stable} (loss: {ce_loss.data:.3e})\")\n", + " except:\n", + " print(f\" CE stable: False (exception)\")\n", + " \n", + " # Binary CrossEntropy stability\n", + " try:\n", + " bce_loss = bce(Tensor([value]), Tensor([1.0]))\n", + " bce_stable = not (np.isnan(bce_loss.data) or np.isinf(bce_loss.data))\n", + " 
print(f\" BCE stable: {bce_stable} (loss: {bce_loss.data:.3e})\")\n", + " except:\n", + " print(f\" BCE stable: False (exception)\")\n", + " \n", + " print(\"\\nπŸ”¬ Gradient Behavior Analysis:\")\n", + " \n", + " # Analyze gradient magnitudes for different prediction qualities\n", + " confidence_levels = [\n", + " (\"Very wrong\", [[-5.0, 5.0, 0.0]], [0]), # Predict class 1, actual class 0\n", + " (\"Slightly wrong\", [[-0.5, 0.5, 0.0]], [0]),\n", + " (\"Uncertain\", [[0.0, 0.0, 0.0]], [0]), \n", + " (\"Slightly right\", [[0.5, -0.5, 0.0]], [0]),\n", + " (\"Very right\", [[5.0, -5.0, 0.0]], [0])\n", + " ]\n", + " \n", + " print(\" Prediction Quality β†’ CrossEntropy Loss:\")\n", + " for name, logits, labels in confidence_levels:\n", + " loss = ce(Tensor(logits), Tensor(labels))\n", + " print(f\" {name:15}: {loss.data:8.4f}\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: Understanding how loss functions behave\n", + " # at extremes helps debug training failures and choose appropriate\n", + " # loss scaling and clipping strategies for production systems.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Stability analysis error: {e}\")\n", + " print(\"Stability analysis requires complete implementations\")\n", + "\n", + "πŸ” SYSTEMS INSIGHT: Mixed Precision Training Analysis\n", + "def analyze_mixed_precision_considerations():\n", + " \"\"\"Analyze loss function behavior with FP16 mixed precision training.\"\"\"\n", + " print(\"πŸ” Mixed Precision Training Analysis\")\n", + " print(\"=\" * 40)\n", + "\n", + " try:\n", + " print(\"\\n⚑ FP16 Numerical Range Analysis:\")\n", + " print(\" FP16 range: ~Β±65,504 (much smaller than FP32's ~Β±3.4Γ—10³⁸)\")\n", + "\n", + " # Simulate FP16 range limitations\n", + " fp16_max = 65504.0\n", + " fp16_min_normal = 2**-14 # Smallest normal FP16 number β‰ˆ 6.1Γ—10⁻⁡\n", + "\n", + " print(f\" FP16 maximum: Β±{fp16_max:,.0f}\")\n", + " print(f\" FP16 min normal: {fp16_min_normal:.2e}\")\n", + " print(f\" Risk: 
Gradients/losses exceeding range β†’ infinity/NaN\")\n", + "\n", + " mse = MeanSquaredError()\n", + " ce = CrossEntropyLoss()\n", + " bce = BinaryCrossEntropyLoss()\n", + "\n", + " print(f\"\\n🎯 Loss Function Mixed Precision Compatibility:\")\n", + "\n", + " # Test cases that might overflow in FP16\n", + " test_cases = [\n", + " (\"Small values\", 1.0, 1.1),\n", + " (\"Medium values\", 100.0, 110.0),\n", + " (\"Large values\", 1000.0, 1100.0),\n", + " (\"FP16 edge\", 200.0, 250.0) # Could cause issues when squared\n", + " ]\n", + "\n", + " print(f\"\\n {'Test Case':>15} {'MSE Loss':>12} {'FP16 Safe?':>12}\")\n", + " print(f\" {'-'*15} {'-'*12} {'-'*12}\")\n", + "\n", + " for name, pred, true in test_cases:\n", + " mse_loss = mse(Tensor([pred]), Tensor([true]))\n", + " squared_error = (pred - true) ** 2\n", + " fp16_safe = squared_error < fp16_max\n", + "\n", + " print(f\" {name:>15} {mse_loss.data:>12.1f} {'βœ…' if fp16_safe else '❌':>12}\")\n", + "\n", + " print(f\"\\nπŸ›‘οΈ Mixed Precision Loss Scaling Strategy:\")\n", + "\n", + " # Demonstrate loss scaling concept\n", + " loss_scales = [1.0, 128.0, 1024.0, 8192.0]\n", + " base_loss = 0.01 # Small loss that might underflow\n", + "\n", + " print(f\" {'Scale Factor':>12} {'Scaled Loss':>12} {'FP16 Precision':>15}\")\n", + " print(f\" {'-'*12} {'-'*12} {'-'*15}\")\n", + "\n", + " for scale in loss_scales:\n", + " scaled_loss = base_loss * scale\n", + "\n", + " # Check if loss is representable in FP16\n", + " if scaled_loss > fp16_min_normal and scaled_loss < fp16_max:\n", + " precision = \"Good\"\n", + " elif scaled_loss <= fp16_min_normal:\n", + " precision = \"Underflow risk\"\n", + " else:\n", + " precision = \"Overflow risk\"\n", + "\n", + " print(f\" {scale:>12.0f} {scaled_loss:>12.3f} {precision:>15}\")\n", + "\n", + " print(f\"\\nβš–οΈ Loss Function Mixed Precision Recommendations:\")\n", + "\n", + " recommendations = [\n", + " (\"MSE\", \"Monitor for gradient explosion in high-dynamic-range problems\", 
\"Medium risk\"),\n", + " (\"CrossEntropy\", \"Use FP32 for softmax computation, FP16 for storage\", \"High risk\"),\n", + " (\"Binary CE\", \"Stable formulation handles FP16 well with proper scaling\", \"Low risk\")\n", + " ]\n", + "\n", + " for loss_type, recommendation, risk in recommendations:\n", + " print(f\" {loss_type:>12}: {recommendation} ({risk})\")\n", + "\n", + " print(f\"\\nπŸ”§ Implementation Best Practices for Mixed Precision:\")\n", + "\n", + " best_practices = [\n", + " \"1. Use automatic mixed precision (AMP) libraries that handle scaling\",\n", + " \"2. Keep loss computation in FP32, only cast inputs to FP16\",\n", + " \"3. Monitor for overflow/underflow during training\",\n", + " \"4. Use gradient clipping to prevent extreme gradients\",\n", + " \"5. Scale losses up during forward pass, scale gradients down during backward\"\n", + " ]\n", + "\n", + " for practice in best_practices:\n", + " print(f\" {practice}\")\n", + "\n", + " # Example mixed precision training pattern\n", + " print(f\"\\nπŸ’» Mixed Precision Training Pattern:\")\n", + " print(f\" ```python\")\n", + " print(f\" # Forward pass in FP16\")\n", + " print(f\" with autocast():\")\n", + " print(f\" predictions = model(inputs.half()) # FP16 inputs\")\n", + " print(f\" loss = loss_fn(predictions, targets) # Loss computed in FP32\")\n", + " print(f\" \")\n", + " print(f\" # Scale loss to prevent underflow\")\n", + " print(f\" scaled_loss = loss * scale_factor\")\n", + " print(f\" scaled_loss.backward()\")\n", + " print(f\" \")\n", + " print(f\" # Unscale gradients before optimizer step\")\n", + " print(f\" scaler.step(optimizer) # Automatically unscales gradients\")\n", + " print(f\" ```\")\n", + "\n", + " # πŸ’‘ WHY THIS MATTERS: Mixed precision training can provide 1.5-2Γ— speedup\n", + " # and 50% memory reduction, but loss functions must be carefully implemented\n", + " # to handle the reduced numerical precision without losing training stability.\n", + "\n", + " except Exception 
as e:\n", + " print(f\"⚠️ Mixed precision analysis error: {e}\")\n", + " print(\"Mixed precision analysis requires complete loss implementations\")\n", + "\n", + "πŸ” SYSTEMS INSIGHT: Production Deployment Analysis\n", + "def analyze_production_deployment_patterns():\n", + " \"\"\"Analyze how loss functions affect production ML system design.\"\"\"\n", + " print(\"πŸ” Production Deployment Pattern Analysis\")\n", + " print(\"=\" * 50)\n", + " \n", + " try:\n", + " print(\"\\nπŸš€ Deployment Scenario Analysis:\")\n", + " \n", + " # Different deployment scenarios with constraints\n", + " scenarios = [\n", + " {\n", + " \"name\": \"Mobile App (Spam Detection)\",\n", + " \"constraints\": \"Memory < 50MB, Latency < 100ms\",\n", + " \"problem\": \"Binary classification\",\n", + " \"recommendation\": \"Binary CrossEntropy\",\n", + " \"reasoning\": \"Minimal memory, fast inference, stable numerics\"\n", + " },\n", + " {\n", + " \"name\": \"Cloud API (Image Classification)\", \n", + " \"constraints\": \"Throughput > 1000 QPS, Cost optimization\",\n", + " \"problem\": \"1000-class classification\",\n", + " \"recommendation\": \"CrossEntropy with mixed precision\",\n", + " \"reasoning\": \"Can handle memory cost, needs throughput\"\n", + " },\n", + " {\n", + " \"name\": \"Edge IoT (Temperature Prediction)\",\n", + " \"constraints\": \"Memory < 1MB, Power < 1W\",\n", + " \"problem\": \"Regression\",\n", + " \"recommendation\": \"MSE with quantization\",\n", + " \"reasoning\": \"Minimal compute, no transcendental functions\"\n", + " },\n", + " {\n", + " \"name\": \"Large Language Model Training\",\n", + " \"constraints\": \"50K vocabulary, Multi-GPU\",\n", + " \"problem\": \"Next token prediction\",\n", + " \"recommendation\": \"Hierarchical Softmax or Sampling\",\n", + " \"reasoning\": \"Standard CrossEntropy too memory intensive\"\n", + " }\n", + " ]\n", + " \n", + " for scenario in scenarios:\n", + " print(f\"\\n πŸ“± {scenario['name']}:\")\n", + " print(f\" Constraints: 
{scenario['constraints']}\")\n", + " print(f\" Problem Type: {scenario['problem']}\")\n", + " print(f\" Best Loss: {scenario['recommendation']}\")\n", + " print(f\" Why: {scenario['reasoning']}\")\n", + " \n", + " print(\"\\nβš–οΈ Production Trade-off Analysis:\")\n", + " \n", + " trade_offs = [\n", + " (\"Memory Efficiency\", \"MSE > Binary CE >> CrossEntropy\"),\n", + " (\"Computational Speed\", \"MSE > Binary CE > CrossEntropy\"),\n", + " (\"Numerical Stability\", \"MSE β‰ˆ Binary CE > CrossEntropy\"), \n", + " (\"Implementation Complexity\", \"MSE > CrossEntropy > Binary CE\"),\n", + " (\"Gradient Quality\", \"CrossEntropy > Binary CE > MSE\"),\n", + " (\"Debug-ability\", \"MSE > Binary CE > CrossEntropy\")\n", + " ]\n", + " \n", + " for criterion, ranking in trade_offs:\n", + " print(f\" {criterion:20}: {ranking}\")\n", + " \n", + " print(\"\\nπŸ”§ Framework Integration Patterns:\")\n", + " \n", + " frameworks = [\n", + " (\"PyTorch\", \"nn.MSELoss(), nn.CrossEntropyLoss(), nn.BCEWithLogitsLoss()\"),\n", + " (\"TensorFlow\", \"keras.losses.MSE, SparseCategoricalCrossentropy, BinaryCrossentropy\"),\n", + " (\"JAX\", \"optax.l2_loss, optax.softmax_cross_entropy, optax.sigmoid_binary_cross_entropy\"),\n", + " (\"Production\", \"Custom implementations with monitoring and fallbacks\")\n", + " ]\n", + " \n", + " for framework, losses in frameworks:\n", + " print(f\" {framework:12}: {losses}\")\n", + " \n", + " # πŸ’‘ WHY THIS MATTERS: Loss function choice affects every aspect\n", + " # of ML system design - from memory requirements to latency to\n", + " # debugging complexity. 
Understanding these trade-offs enables\n", + " # informed architectural decisions for production systems.\n", + " \n", + " except Exception as e:\n", + " print(f\"⚠️ Deployment analysis error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1f0245d3", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## πŸ€” ML Systems Thinking: Interactive Questions\n", + "\n", + "Now that you've implemented all core loss functions and analyzed their systems characteristics, let's explore their implications for real ML systems:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0789afbb", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "question-1-loss-selection", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Question 1: Loss Function Selection for Production Systems**\n", + "\n", + "You're building a production recommendation system that predicts user ratings (1-5 stars) for movies.\n", + "\n", + "Your team proposes three approaches:\n", + "A) Regression approach: Use MSE loss with continuous outputs (1.0-5.0)\n", + "B) Classification approach: Use CrossEntropy loss with 5 distinct classes \n", + "C) Ordinal approach: Use a custom loss that penalizes being off by multiple stars more heavily\n", + "\n", + "Analyze each approach considering your implementations:\n", + "\n", + "**Technical Analysis:**\n", + "- How does the memory scaling of CrossEntropy (O(batch_size Γ— num_classes)) affect this 5-class problem?\n", + "- What are the computational complexity differences between MSE's O(n) and CrossEntropy's O(nΓ—c) for c=5?\n", + "- How do the gradient behaviors differ? 
(MSE's quadratic vs CrossEntropy's logarithmic penalties)\n", + "\n", + "**Systems Implications:**\n", + "- Which approach would be most memory efficient for large batch training?\n", + "- How does numerical stability differ when handling edge cases (ratings at boundaries)?\n", + "- Which approach would have the most predictable inference latency?\n", + "\n", + "**Business Alignment:**\n", + "- How well does each loss function's penalty structure match the business objective?\n", + "- What happens with fractional ratings like 3.7? How would each approach handle this?\n", + "- Which approach would be easiest to monitor and debug in production?\n", + "\n", + "Recommend an approach with justification based on your implementation experience.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "583f52ea", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "question-2-numerical-stability", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Question 2: Debugging Numerical Stability in Production**\n", + "\n", + "Your cross-entropy loss function works perfectly in development, but in production you start seeing NaN losses that crash training after several hours.\n", + "\n", + "**Root Cause Analysis:**\n", + "Based on your implementation of the log-sum-exp trick and epsilon clipping:\n", + "1. What specific numerical computations in cross-entropy can produce NaN values?\n", + "2. Walk through how your `max_logits = np.max(prediction_logits, axis=1, keepdims=True)` prevents overflow\n", + "3. Explain why `np.clip(softmax_pred, epsilon, 1.0 - epsilon)` prevents underflow\n", + "4. What would happen if you removed epsilon clipping? Trace through the computation.\n", + "\n", + "**Production Debugging:**\n", + "Given millions of training examples, how would you:\n", + "1. 
Identify which specific inputs trigger the numerical instability?\n", + "2. Modify your CrossEntropy implementation to add monitoring without affecting performance?\n", + "3. Design fallback behavior when numerical issues are detected?\n", + "4. Validate that your fixes don't change the mathematical behavior for normal inputs?\n", + "\n", + "**Comparison Analysis:**\n", + "- How does your stable Binary CrossEntropy formulation `max(x,0) - x*y + log(1 + exp(-|x|))` prevent similar issues?\n", + "- Why is MSE generally more numerically stable than CrossEntropy?\n", + "- How would you modify loss functions for mixed precision (FP16) training where numerical ranges are more limited?\n", + "\n", + "Research how PyTorch and TensorFlow handle these same challenges in their loss implementations.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f65771b", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "question-3-custom-loss-design", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "πŸ€” **Question 3: Implementing and Optimizing Custom Loss Functions**\n", + "\n", + "You've seen examples of custom loss functions for business objectives. Now analyze implementation and optimization challenges:\n", + "\n", + "**Scenario Analysis:**\n", + "Choose one custom loss from the examples (Asymmetric BCE, Focal Loss, Ranking-Aware, Multi-Task, or Contrastive) and analyze:\n", + "\n", + "**Implementation Deep Dive:**\n", + "1. Trace through the numerical computation step-by-step for your chosen custom loss\n", + "2. Identify potential numerical stability issues compared to standard loss functions\n", + "3. How does the computational complexity compare to MSE/CrossEntropy/Binary CE?\n", + "4. What additional memory overhead does the custom formulation introduce?\n", + "\n", + "**Gradient Flow Analysis:**\n", + "5. 
How do the custom weighting schemes affect gradient magnitudes during backpropagation?\n", + "6. What happens to gradient flow when the custom weights become extreme (very large or very small)?\n", + "7. How would you detect and handle gradient explosion or vanishing in your custom loss?\n", + "8. Design gradient clipping strategies specific to your chosen custom loss function\n", + "\n", + "**Production Integration Challenges:**\n", + "9. How would you implement your custom loss to work with mixed precision training (FP16)?\n", + "10. What logging and monitoring would you add to track custom loss behavior in production?\n", + "11. How would you A/B test a custom loss against standard losses without affecting user experience?\n", + "12. Design a rollback strategy if the custom loss causes training instability\n", + "\n", + "**Performance Optimization:**\n", + "13. Identify computational bottlenecks in your chosen custom loss implementation\n", + "14. How could you vectorize operations to improve batch processing efficiency?\n", + "15. What caching strategies could reduce redundant computations?\n", + "16. How would you benchmark training speed impact compared to standard losses?\n", + "\n", + "**Business Validation Framework:**\n", + "17. Design metrics to validate that your custom loss actually improves business objectives\n", + "18. How would you separate loss function improvements from other training improvements?\n", + "19. What offline evaluation would you perform before deploying the custom loss?\n", + "20. How would you monitor for unexpected business metric changes after deployment?\n", + "\n", + "Implement one optimization for your chosen custom loss and explain how it addresses a specific production challenge.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "4ed8ca84", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## 🎯 MODULE SUMMARY: Loss Functions - Learning Objectives Made Mathematical\n", + "\n", + "Congratulations! 
You've successfully implemented the complete foundation for neural network training objectives:\n", + "\n", + "### What You've Accomplished\n", + "βœ… **Complete Loss Function Library**: MSE for regression, CrossEntropy for multi-class classification, and Binary CrossEntropy for binary classification with production-grade numerical stability\n", + "βœ… **Systems Engineering Understanding**: Deep comprehension of computational complexity, memory scaling, and numerical stability requirements for reliable ML systems\n", + "βœ… **Mathematical Implementation Mastery**: Built loss functions from mathematical foundations through stable computational formulations to working code\n", + "βœ… **Production Readiness Knowledge**: Understanding of how loss function choice affects training speed, memory usage, and deployment feasibility\n", + "βœ… **Framework Integration Insight**: Clear connection between your implementations and how PyTorch/TensorFlow solve the same problems\n", + "\n", + "### Key Learning Outcomes\n", + "- **Loss Function Theory**: How mathematical loss functions translate business objectives into optimization targets that neural networks can learn from\n", + "- **Numerical Stability Engineering**: Critical importance of stable implementations that prevent catastrophic training failures in production systems\n", + "- **Systems Performance Analysis**: Understanding of computational complexity, memory scaling, and performance trade-offs that affect production deployment\n", + "- **Production ML Patterns**: Knowledge of how loss function choice affects system architecture, monitoring requirements, and debugging complexity\n", + "\n", + "### Mathematical Foundations Mastered \n", + "- **MSE computation**: `(1/n) Γ— Ξ£(y_pred - y_true)Β²` with smooth quadratic gradients for regression optimization\n", + "- **CrossEntropy with stable softmax**: Log-sum-exp trick and epsilon clipping for numerically robust classification\n", + "- **Binary CrossEntropy stability**: 
`max(x,0) - xΓ—y + log(1 + exp(-|x|))` formulation preventing overflow/underflow issues\n", + "- **Gradient behavior understanding**: How different loss functions create different optimization landscapes and learning dynamics\n", + "\n", + "### Professional Skills Developed\n", + "- **Production-quality implementation**: Robust numerical stability measures that prevent training failures with real-world data\n", + "- **Performance optimization**: Understanding of computational and memory complexity that affects scalability and deployment\n", + "- **Systems debugging**: Knowledge of how to identify and fix numerical stability issues in production ML systems\n", + "- **Framework integration**: Clear understanding of how your implementations connect to professional ML development workflows\n", + "\n", + "### Ready for Advanced Applications\n", + "Your loss function implementations now enable:\n", + "- **Complete training loops** that optimize neural networks on real datasets with proper convergence monitoring\n", + "- **Custom loss functions** that align with specific business objectives and domain requirements\n", + "- **Production deployment** with confidence in numerical stability and performance characteristics\n", + "- **Advanced optimization** techniques that build on solid loss function foundations\n", + "\n", + "### Connection to Real ML Systems\n", + "Your implementations mirror the essential patterns used in:\n", + "- **PyTorch's loss functions**: Same mathematical formulations with identical numerical stability measures\n", + "- **TensorFlow's losses**: Equivalent computational patterns and production-grade error handling\n", + "- **Production ML pipelines**: The exact loss functions that power real ML systems at companies like Google, Meta, and OpenAI\n", + "- **Research frameworks**: Foundation for experimenting with novel loss functions and training objectives\n", + "\n", + "### Next Steps\n", + "With solid loss function implementations, you're ready 
to:\n", + "1. **Export your module**: `tito module complete 04_losses`\n", + "2. **Validate integration**: `tito test --module losses`\n", + "3. **Explore autograd integration**: See how loss functions connect with automatic differentiation\n", + "4. **Ready for Module 06**: Build automatic gradient computation that makes loss-based learning possible!\n", + "\n", + "**Your achievement**: You've built the mathematical foundation that transforms predictions into learning signals - the critical bridge between model outputs and optimization objectives that makes neural network training possible!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfc087a8", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "final-demo", + "locked": false, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " print(\"πŸ”₯ TinyTorch Loss Functions Module - Complete Demo\")\n", + " print(\"=\" * 55)\n", + " \n", + " # Test all core implementations\n", + " print(\"\\nπŸ§ͺ Testing All Loss Functions:\")\n", + " test_unit_mse_loss()\n", + " test_unit_crossentropy_loss()\n", + " test_unit_binary_crossentropy_loss()\n", + " test_unit_comprehensive_loss_integration()\n", + " \n", + " # Run systems analysis functions\n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"πŸ” Systems Analysis Functions\")\n", + " print(\"=\" * 30)\n", + "\n", + " visualize_loss_landscapes()\n", + " analyze_mse_properties()\n", + " analyze_crossentropy_stability()\n", + " analyze_binary_crossentropy_efficiency()\n", + " analyze_mixed_precision_considerations()\n", + " analyze_loss_performance_characteristics()\n", + " analyze_numerical_stability_edge_cases()\n", + " analyze_production_deployment_patterns()\n", + " \n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"πŸ“Š Loss Function Usage Examples\")\n", + " print(\"=\" * 35)\n", + " \n", + " # Example 1: Regression with MSE\n", + " print(\"\\n1. 
Regression Example (Predicting House Prices):\")\n", + " mse = MeanSquaredError()\n", + " house_predictions = Tensor([[250000, 180000, 320000]]) # Predicted prices\n", + " house_actual = Tensor([[240000, 175000, 315000]]) # Actual prices\n", + " regression_loss = mse(house_predictions, house_actual)\n", + " print(f\" House price prediction loss: ${regression_loss.data:,.0f}Β² average error\")\n", + " \n", + " # Example 2: Multi-class classification with CrossEntropy\n", + " print(\"\\n2. Multi-Class Classification Example (Image Recognition):\")\n", + " ce = CrossEntropyLoss()\n", + " image_logits = Tensor([[2.1, 0.5, -0.3, 1.8, 0.1], # Model outputs for 5 classes\n", + " [-0.2, 3.1, 0.8, -1.0, 0.4]]) # (cat, dog, bird, fish, rabbit)\n", + " true_classes = Tensor([0, 1]) # First image = cat, second = dog\n", + " classification_loss = ce(image_logits, true_classes)\n", + " print(f\" Image classification loss: {classification_loss.data:.4f}\")\n", + " \n", + " # Example 3: Binary classification with BCE\n", + " print(\"\\n3. 
Binary Classification Example (Spam Detection):\")\n", + " bce = BinaryCrossEntropyLoss()\n", + " spam_logits = Tensor([[1.2], [-0.8], [2.1], [-1.5]]) # Spam prediction logits\n", + " spam_labels = Tensor([[1.0], [0.0], [1.0], [0.0]]) # 1=spam, 0=not spam\n", + " spam_loss = bce(spam_logits, spam_labels)\n", + " print(f\" Spam detection loss: {spam_loss.data:.4f}\")\n", + " \n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"🎯 Loss Function Characteristics\")\n", + " print(\"=\" * 35)\n", + " \n", + " # Compare perfect vs imperfect predictions\n", + " print(\"\\nπŸ“Š Perfect vs Random Predictions:\")\n", + " \n", + " # Perfect predictions\n", + " perfect_mse = mse(Tensor([5.0]), Tensor([5.0]))\n", + " perfect_ce = ce(Tensor([[10.0, 0.0, 0.0]]), Tensor([0]))\n", + " perfect_bce = bce(Tensor([10.0]), Tensor([1.0]))\n", + " \n", + " print(f\" Perfect MSE loss: {perfect_mse.data:.6f}\")\n", + " print(f\" Perfect CE loss: {perfect_ce.data:.6f}\")\n", + " print(f\" Perfect BCE loss: {perfect_bce.data:.6f}\")\n", + " \n", + " # Random predictions\n", + " random_mse = mse(Tensor([3.0]), Tensor([5.0])) # Off by 2\n", + " random_ce = ce(Tensor([[0.0, 0.0, 0.0]]), Tensor([0])) # Uniform distribution\n", + " random_bce = bce(Tensor([0.0]), Tensor([1.0])) # 50% confidence\n", + " \n", + " print(f\" Random MSE loss: {random_mse.data:.6f}\")\n", + " print(f\" Random CE loss: {random_ce.data:.6f}\")\n", + " print(f\" Random BCE loss: {random_bce.data:.6f}\")\n", + " \n", + " print(\"\\nπŸŽ‰ Complete loss function foundation ready!\")\n", + " print(\" βœ… MSE for regression problems\")\n", + " print(\" βœ… CrossEntropy for multi-class classification\")\n", + " print(\" βœ… Binary CrossEntropy for binary classification\")\n", + " print(\" βœ… Numerically stable implementations\")\n", + " print(\" βœ… Production-ready batch processing\")\n", + " print(\" βœ… Systems analysis and performance insights\")\n", + " print(\" βœ… Ready for neural network training!\")" + ] + } + ], + 
"metadata": { + "jupytext": { + "main_language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modules/04_losses/losses_dev.py b/modules/04_losses/losses_dev.py index 149b1bbc..9bb2463c 100644 --- a/modules/04_losses/losses_dev.py +++ b/modules/04_losses/losses_dev.py @@ -72,7 +72,7 @@ try: # In a complete system, these would integrate with the autograd Variable system except ImportError: # For development, import from local modules - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # %% nbgrader={"grade": false, "grade_id": "losses-setup", "locked": false, "schema_version": 3, "solution": false, "task": false} @@ -85,7 +85,7 @@ print("Ready to build loss functions for neural network training!") """ ## Where This Code Lives in the Final Package -**Learning Side:** You work in modules/05_losses/losses_dev.py +**Learning Side:** You work in modules/04_losses/losses_dev.py **Building Side:** Code exports to tinytorch.core.losses ```python @@ -2081,7 +2081,7 @@ Your implementations mirror the essential patterns used in: ### Next Steps With solid loss function implementations, you're ready to: -1. **Export your module**: `tito module complete 05_losses` +1. **Export your module**: `tito module complete 04_losses` 2. **Validate integration**: `tito test --module losses` 3. **Explore autograd integration**: See how loss functions connect with automatic differentiation 4. **Ready for Module 06**: Build automatic gradient computation that makes loss-based learning possible! 
diff --git a/modules/07_training/training_dev.py b/modules/07_training/training_dev.py index 0cbf39d4..c825f5cf 100644 --- a/modules/07_training/training_dev.py +++ b/modules/07_training/training_dev.py @@ -51,9 +51,9 @@ import time import pickle # Add module directories to Python path -sys.path.append(os.path.abspath('modules/source/02_tensor')) -sys.path.append(os.path.abspath('modules/source/03_activations')) -sys.path.append(os.path.abspath('modules/source/04_layers')) +sys.path.append(os.path.abspath('modules/source/01_tensor')) +sys.path.append(os.path.abspath('modules/source/02_activations')) +sys.path.append(os.path.abspath('modules/source/03_layers')) sys.path.append(os.path.abspath('modules/source/05_networks')) sys.path.append(os.path.abspath('modules/source/06_autograd')) sys.path.append(os.path.abspath('modules/source/07_spatial')) @@ -67,14 +67,34 @@ sys.path.append(os.path.abspath('modules/source/09_dataloader')) # No longer needed # Import all the building blocks we need -from tinytorch.core.tensor import Tensor -from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax -from tinytorch.core.layers import Linear -from tinytorch.core.networks import Sequential, create_mlp -from tinytorch.core.spatial import Conv2D, flatten -from tinytorch.utils.data import Dataset, DataLoader -from tinytorch.core.autograd import Variable # FOR AUTOGRAD INTEGRATION -from tinytorch.core.optimizers import SGD, Adam +try: + from tinytorch.core.tensor import Tensor + from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax + from tinytorch.core.layers import Linear + from tinytorch.core.networks import Sequential, create_mlp + from tinytorch.core.spatial import Conv2D, flatten + from tinytorch.utils.data import Dataset, DataLoader + from tinytorch.core.autograd import Variable # FOR AUTOGRAD INTEGRATION + from tinytorch.core.optimizers import SGD, Adam +except ImportError: + # For development - import from local modules + import sys + import os + 
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_activations')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '03_layers')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '06_optimizers')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '08_spatial')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '09_dataloader')) + + from tensor_dev import Tensor + from activations_dev import ReLU, Sigmoid, Tanh, Softmax + from layers_dev import Linear, Sequential, create_mlp + from spatial_dev import Conv2D, flatten + from dataloader_dev import Dataset, DataLoader + from autograd_dev import Variable + from optimizers_dev import SGD, Adam # πŸ”₯ AUTOGRAD INTEGRATION: Loss functions now return Variables that support .backward() # This enables automatic gradient computation for neural network training! 
diff --git a/modules/08_spatial/spatial_dev.py b/modules/08_spatial/spatial_dev.py index b3594398..0ba4a036 100644 --- a/modules/08_spatial/spatial_dev.py +++ b/modules/08_spatial/spatial_dev.py @@ -57,9 +57,9 @@ try: except ImportError: # Development mode - import from local module files sys.path.extend([ - os.path.join(os.path.dirname(__file__), '..', '02_tensor'), - os.path.join(os.path.dirname(__file__), '..', '03_activations'), - os.path.join(os.path.dirname(__file__), '..', '04_layers') + os.path.join(os.path.dirname(__file__), '..', '01_tensor'), + os.path.join(os.path.dirname(__file__), '..', '02_activations'), + os.path.join(os.path.dirname(__file__), '..', '03_layers') ]) from tensor_dev import Tensor, Parameter from activations_dev import ReLU diff --git a/modules/10_tokenization/tokenization_dev.py b/modules/10_tokenization/tokenization_dev.py index e081e706..ec3b1f3b 100644 --- a/modules/10_tokenization/tokenization_dev.py +++ b/modules/10_tokenization/tokenization_dev.py @@ -55,7 +55,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development, import from local tensor module - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # %% nbgrader={"grade": false, "grade_id": "tokenization-welcome", "locked": false, "schema_version": 3, "solution": false, "task": false} diff --git a/modules/11_embeddings/embeddings_dev.py b/modules/11_embeddings/embeddings_dev.py index e2a2d0e4..82ad6f5f 100644 --- a/modules/11_embeddings/embeddings_dev.py +++ b/modules/11_embeddings/embeddings_dev.py @@ -54,7 +54,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development, import from local tensor module - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor 
# Try to import tokenization classes diff --git a/modules/12_attention/attention_dev.py b/modules/12_attention/attention_dev.py index 07e0b945..f79a4d83 100644 --- a/modules/12_attention/attention_dev.py +++ b/modules/12_attention/attention_dev.py @@ -60,7 +60,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development, import from local tensor module - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # Try to import embedding classes diff --git a/modules/13_transformers/transformers_dev.py b/modules/13_transformers/transformers_dev.py index a0c87bb1..94997f91 100644 --- a/modules/13_transformers/transformers_dev.py +++ b/modules/13_transformers/transformers_dev.py @@ -57,7 +57,7 @@ def _import_from_module_dev(module_name, class_names): module_path = os.path.join(os.path.dirname(__file__), '..', module_name) sys.path.insert(0, module_path) try: - if module_name == '02_tensor': + if module_name == '01_tensor': from tensor_dev import Tensor return {'Tensor': Tensor} elif module_name == '13_attention': @@ -81,7 +81,7 @@ if 'tinytorch' in sys.modules: from tinytorch.core.embeddings import Embedding, PositionalEncoding else: # Development: Import from local modules - tensor_imports = _import_from_module_dev('02_tensor', ['Tensor']) + tensor_imports = _import_from_module_dev('01_tensor', ['Tensor']) Tensor = tensor_imports['Tensor'] attention_imports = _import_from_module_dev('13_attention', diff --git a/modules/16_quantization/quantization_dev.py b/modules/16_quantization/quantization_dev.py index a1db5829..7f5ede84 100644 --- a/modules/16_quantization/quantization_dev.py +++ b/modules/16_quantization/quantization_dev.py @@ -65,7 +65,7 @@ try: from tinytorch.core.spatial import Conv2d, MaxPool2D except ImportError: # For development, import from local modules - 
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) sys.path.append(os.path.join(os.path.dirname(__file__), '..', '06_spatial')) try: from tensor_dev import Tensor diff --git a/modules/18_caching/caching_dev.py b/modules/18_caching/caching_dev.py index 1583dd5b..28fe94a4 100644 --- a/modules/18_caching/caching_dev.py +++ b/modules/18_caching/caching_dev.py @@ -57,7 +57,7 @@ try: from tinytorch.core.tensor import Tensor except ImportError: # For development, import from local tensor module - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor')) + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor # Try to import attention classes diff --git a/progress.json b/progress.json index f9bf5e98..c55c4699 100644 --- a/progress.json +++ b/progress.json @@ -1,11 +1,16 @@ { "completed_modules": [ - "01" + "01", + "02", + "03", + "06", + "08" ], - "last_completed": "01", - "last_updated": "2025-09-28T07:57:44.694673", + "last_completed": "08", + "last_updated": "2025-09-28T08:07:12.088651", "started_modules": [ - "01" + "01", + "04" ], - "last_worked": "01" + "last_worked": "04" } \ No newline at end of file