From 3036ef74ef6533bb7a45a583a69fca68a7bf81a1 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi <vj@eecs.harvard.edu>
Date: Mon, 29 Sep 2025 10:54:14 -0400
Subject: [PATCH] Fix module dependency ordering - no forward references

- Parameter class now works with basic Tensors initially and upgrades to Variables when autograd is available
- Loss functions work with basic tensor operations before the autograd module is available
- Each module can now be built and tested sequentially without needing future modules
- Modules 01-04 work with basic Tensors only
- Module 05 introduces autograd, then earlier modules get gradient capabilities
- Restored proper pedagogical flow for incremental learning
---
 modules/03_layers/layers_dev.py | 123 ++++++++++++-------
 modules/04_losses/losses_dev.py | 206 ++++++++++++++++++++++----------
 2 files changed, 222 insertions(+), 107 deletions(-)

diff --git a/modules/03_layers/layers_dev.py b/modules/03_layers/layers_dev.py
index 9f417e8a..795fcfab 100644
--- a/modules/03_layers/layers_dev.py
+++ b/modules/03_layers/layers_dev.py
@@ -77,75 +77,116 @@ else:
     finally:
         sys.path.pop(0)  # Always clean up path to avoid side effects
 
-# CRITICAL FIX: Parameter must be Variable-based for gradient tracking
 class Parameter:
     """
-    A trainable parameter that supports automatic differentiation.
+    A trainable parameter backed by a Tensor, or by a Variable once autograd is importable.
 
-    This creates a Variable with requires_grad=True for use as neural network parameters.
-    Essential for gradient-based optimization of weights and biases.
+    Initially works with basic Tensors only (modules 01-04).
+    After module 05 (autograd), gets enhanced with automatic differentiation.
 
-    IMPORTANT: Parameters must participate in autograd for training to work.
+    This staged approach allows students to build and test layers before learning autograd.
     """
     def __init__(self, data):
-        # Import Variable locally to avoid circular imports
+        if isinstance(data, Tensor):
+            self._tensor = data
+        else:
+            # Convert numpy array or list to Tensor
+            self._tensor = Tensor(data)
+
+        # Manual gradient slot; the grad property uses it only while no Variable backs this parameter
+        self._grad = None
+        self._requires_grad = True  # Value reported by requires_grad while no Variable backs this parameter
+
+        # Try to upgrade to Variable if autograd is available (after module 05)
+        self._try_upgrade_to_variable()
+
+    def _try_upgrade_to_variable(self):
+        """Back this parameter with a Variable if tinytorch.core.autograd imports; otherwise stay Tensor-only."""
         try:
+            # Try importing Variable (will work after module 05)
             from tinytorch.core.autograd import Variable
+
+            # Build the Variable from the Tensor's raw data with gradient tracking requested
+            self._variable = Variable(self._tensor.data, requires_grad=True)
+            self._is_variable = True
         except ImportError:
-            # For development, import from local module
-            import sys
-            import os
-            sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
-            from autograd_dev import Variable
-
-        # Create Variable with gradient tracking enabled
-        if isinstance(data, Variable):
-            self._variable = data
-            if not data.requires_grad:
-                # Ensure parameters always require gradients
-                self._variable.requires_grad = True
-        else:
-            # Convert data to Variable with gradient tracking
-            self._variable = Variable(data, requires_grad=True)
-
-    def __getattr__(self, name):
-        """Delegate all attribute access to the underlying Variable."""
-        return getattr(self._variable, name)
-
-    def __setattr__(self, name, value):
-        """Handle setting attributes."""
-        if name == '_variable':
-            super().__setattr__(name, value)
-        else:
-            # Delegate to underlying Variable
-            setattr(self._variable, name, value)
+            # Autograd not yet available - stay as basic Parameter with Tensor
+            self._variable = None
+            self._is_variable = False
 
     @property
     def data(self):
         """Access to underlying data."""
-        return self._variable.data
+        if self._is_variable:
+            return self._variable.data
+        else:
+            return self._tensor.data
+
+    @property
+    def shape(self):
+        """Shape of the backing Variable's data, or of the Tensor when no Variable is present."""
+        if self._is_variable:
+            return self._variable.data.shape
+        else:
+            return self._tensor.shape
 
     @property
     def grad(self):
-        """Access to gradient."""
-        return self._variable.grad
+        """Gradient of the backing Variable, or the manually stored value when no Variable is present."""
+        if self._is_variable:
+            return self._variable.grad
+        else:
+            return self._grad  # None until assigned through the grad setter
 
     @grad.setter
     def grad(self, value):
         """Set gradient."""
-        self._variable.grad = value
+        if self._is_variable:
+            self._variable.grad = value
+        else:
+            self._grad = value
 
     @property
     def requires_grad(self):
         """Whether this parameter requires gradients."""
-        return self._variable.requires_grad
+        if self._is_variable:
+            return self._variable.requires_grad
+        else:
+            return self._requires_grad
 
     def backward(self, gradient=None):
-        """Backpropagate gradients."""
-        return self._variable.backward(gradient)
+        """Backpropagate via the backing Variable; raises NotImplementedError when no Variable is present."""
+        if self._is_variable:
+            return self._variable.backward(gradient)
+        else:
+            raise NotImplementedError("Gradient computation requires autograd module (module 05)")
+
+    def __add__(self, other):
+        """Return ``self + other``, delegating to the backing Variable or Tensor."""
+        if self._is_variable:
+            return self._variable + other
+        else:
+            return self._tensor + other
+
+    def __mul__(self, other):
+        """Return ``self * other``, delegating to the backing Variable or Tensor."""
+        if self._is_variable:
+            return self._variable * other
+        else:
+            return self._tensor * other
+
+    def __matmul__(self, other):
+        """Return ``self @ other``, delegating to the backing Variable or Tensor."""
+        if self._is_variable:
+            return self._variable @ other
+        else:
+            return self._tensor @ other
 
     def __repr__(self):
-        return f"Parameter({self._variable})"
+        if self._is_variable:
+            return f"Parameter({self._variable})"
+        else:
+            return f"Parameter(Tensor({self._tensor.data.shape}), requires_grad={self._requires_grad})"
 
 # In[ ]:
 
diff --git a/modules/04_losses/losses_dev.py b/modules/04_losses/losses_dev.py
index 5c021137..2d1dd4d1 100644
--- a/modules/04_losses/losses_dev.py
+++ b/modules/04_losses/losses_dev.py
@@ -63,18 +63,75 @@ import numpy as np
 import sys
 import os
 
-# Import our building blocks - try package first, then local modules
+# Import our building blocks - Tensor first, autograd operations if available
 try:
     from tinytorch.core.tensor import Tensor
-    from tinytorch.core.autograd import Variable, subtract, multiply, add, matmul
-    # CRITICAL: Now using full autograd integration for proper gradient flow
-    # These losses will work with the autograd computational graph
 except ImportError:
     # For development, import from local modules
     sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
     from tensor_dev import Tensor
-    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
-    from autograd_dev import Variable, subtract, multiply, add, matmul
+
+# Try to import autograd operations if available (after module 05)
+# Initially losses work with basic tensors, get enhanced with autograd later
+_autograd_available = False
+try:
+    from tinytorch.core.autograd import Variable, subtract, multiply, add, matmul
+    _autograd_available = True
+except ImportError:
+    # Try development import
+    try:
+        sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
+        from autograd_dev import Variable, subtract, multiply, add, matmul
+        _autograd_available = True
+    except ImportError:
+        # Autograd not available yet - losses will work with basic tensor operations
+        # This is the expected case for modules 01-04
+        _autograd_available = False
+
+        # Define basic operations for tensors (will be replaced by autograd versions later)
+        def subtract(a, b):
+            """
+            Tensor-only stand-in for the autograd ``subtract`` operation.
+
+            Each operand that exposes ``.data`` is unwrapped first; the
+            difference ``a - b`` is returned wrapped in a new Tensor.
+            """
+            lhs = a.data if hasattr(a, 'data') else a
+            rhs = b.data if hasattr(b, 'data') else b
+            return Tensor(lhs - rhs)
+
+        def multiply(a, b):
+            """
+            Tensor-only stand-in for the autograd ``multiply`` operation.
+
+            Each operand that exposes ``.data`` is unwrapped first; the
+            product ``a * b`` is returned wrapped in a new Tensor.
+            """
+            lhs = a.data if hasattr(a, 'data') else a
+            rhs = b.data if hasattr(b, 'data') else b
+            return Tensor(lhs * rhs)
+
+        def add(a, b):
+            """
+            Tensor-only stand-in for the autograd ``add`` operation.
+
+            Each operand that exposes ``.data`` is unwrapped first; the
+            sum ``a + b`` is returned wrapped in a new Tensor.
+            """
+            lhs = a.data if hasattr(a, 'data') else a
+            rhs = b.data if hasattr(b, 'data') else b
+            return Tensor(lhs + rhs)
+
+        def matmul(a, b):
+            """
+            Tensor-only stand-in for the autograd ``matmul`` operation.
+
+            Each operand that exposes ``.data`` is unwrapped first; the
+            matrix product ``a @ b`` is returned wrapped in a new Tensor.
+            """
+            lhs = a.data if hasattr(a, 'data') else a
+            rhs = b.data if hasattr(b, 'data') else b
+            return Tensor(lhs @ rhs)
 
 # %% nbgrader={"grade": false, "grade_id": "losses-setup", "locked": false, "schema_version": 3, "solution": false, "task": false}
 print("FIRE TinyTorch Loss Functions Module")
@@ -2208,11 +2265,11 @@ to enable proper backpropagation through the computational graph.
 #| export
 class MSELoss:
     """
-    Mean Squared Error Loss with Autograd Integration
+    Mean Squared Error Loss - Works with both Tensors and Variables
 
-    This version properly integrates with the autograd system to enable
-    gradient flow during backpropagation. Unlike the basic MeanSquaredError
-    above, this returns a Variable that participates in the computational graph.
+    Initially works with basic Tensors (modules 01-04).
+    Automatically upgrades to use Variables when autograd is available (module 05+).
+    This staged approach allows testing loss functions before learning automatic differentiation.
     """
 
     def __init__(self):
@@ -2221,44 +2278,55 @@ class MSELoss:
 
     def __call__(self, predictions, targets):
         """
-        Compute MSE loss with autograd support.
+        Compute the mean squared error between predictions and targets.
 
         Args:
-            predictions: Model predictions (Variable or convertible to Variable)
-            targets: True targets (Variable or convertible to Variable)
+            predictions: Model predictions (Tensor/Variable)
+            targets: True targets (Tensor/Variable)
 
         Returns:
-            Variable with scalar loss value and gradient tracking
+            Scalar loss value (Tensor initially, Variable after autograd)
         """
-        # Ensure inputs are Variables for gradient tracking
-        if not isinstance(predictions, Variable):
+        if _autograd_available:
+            # Autograd available - use Variables for gradient tracking
+            if not isinstance(predictions, Variable):
+                pred_data = predictions.data if hasattr(predictions, 'data') else predictions
+                predictions = Variable(pred_data, requires_grad=False)
+
+            if not isinstance(targets, Variable):
+                target_data = targets.data if hasattr(targets, 'data') else targets
+                targets = Variable(target_data, requires_grad=False)
+
+            # Compute MSE using autograd operations
+            diff = subtract(predictions, targets)
+            squared_diff = multiply(diff, diff)
+
+            # Sum all elements and divide by count to get mean
+            loss = Variable.sum(squared_diff)
+
+            # Convert to mean (divide by number of elements)
+            batch_size = predictions.data.data.size
+            mean_loss = multiply(loss, 1.0 / batch_size)
+        else:
+            # Basic tensor operations - no gradient tracking yet
             pred_data = predictions.data if hasattr(predictions, 'data') else predictions
-            predictions = Variable(pred_data, requires_grad=False)
-
-        if not isinstance(targets, Variable):
             target_data = targets.data if hasattr(targets, 'data') else targets
-            targets = Variable(target_data, requires_grad=False)
 
-        # Compute MSE using autograd operations
-        diff = subtract(predictions, targets)
-        squared_diff = multiply(diff, diff)
-
-        # Sum all elements and divide by count to get mean
-        loss = Variable.sum(squared_diff)
-
-        # Convert to mean (divide by number of elements)
-        batch_size = predictions.data.data.size
-        mean_loss = multiply(loss, 1.0 / batch_size)
+            # np.asarray: a raw ndarray unwraps to a memoryview above and a list stays a list; neither supports '-'
+            diff = np.asarray(pred_data) - np.asarray(target_data)
+            squared_diff = diff * diff
+            mean_loss = Tensor(np.mean(squared_diff))
 
         return mean_loss
 
 #| export
 class CrossEntropyLoss:
     """
-    Cross-Entropy Loss with Autograd Integration
+    Cross-Entropy Loss - Works with both Tensors and Variables
 
-    Simplified cross-entropy that works with the autograd system.
-    For training neural networks with gradient-based optimization.
+    Initially works with basic Tensors (modules 01-04).
+    Automatically upgrades to use Variables when autograd is available (module 05+).
+    This staged approach allows testing loss functions before learning automatic differentiation.
     """
 
     def __init__(self):
@@ -2267,27 +2335,29 @@ class CrossEntropyLoss:
 
     def __call__(self, predictions, targets):
         """
-        Compute cross-entropy loss with autograd support.
+        Compute cross-entropy loss.
 
         Args:
-            predictions: Model predictions/logits (Variable)
-            targets: True class indices (Variable or numpy array)
+            predictions: Model predictions/logits (Tensor/Variable)
+            targets: True class indices (Tensor/Variable or numpy array)
 
         Returns:
-            Variable with scalar loss value and gradient tracking
+            Scalar loss value (Tensor initially, Variable after autograd)
         """
-        # Handle Variable inputs
-        if isinstance(predictions, Variable):
-            pred_data = predictions.data.data
-        elif hasattr(predictions, 'data'):
-            pred_data = predictions.data
+        # Unwrap to the raw array; ndarray itself has a .data attribute (a memoryview), so never unwrap an ndarray
+        if hasattr(predictions, 'data') and not isinstance(predictions, np.ndarray):
+            pred_data = predictions.data  # Tensor -> array, Variable -> Tensor
+            if hasattr(pred_data, 'data') and not isinstance(pred_data, np.ndarray):
+                # Variable case: one more level down to the array
+                pred_data = pred_data.data
         else:
             pred_data = predictions
 
-        if isinstance(targets, Variable):
-            target_data = targets.data.data
-        elif hasattr(targets, 'data'):
-            target_data = targets.data
+        if hasattr(targets, 'data') and not isinstance(targets, np.ndarray):
+            target_data = targets.data  # Tensor -> array, Variable -> Tensor
+            if hasattr(target_data, 'data') and not isinstance(target_data, np.ndarray):
+                # Variable case: one more level down to the array
+                target_data = target_data.data
         else:
             target_data = targets
 
@@ -2311,27 +2381,31 @@ class CrossEntropyLoss:
             # One-hot labels
             loss = -np.mean(np.sum(target_data * np.log(softmax_pred), axis=-1))
 
-        # Return as Variable with gradient function
-        result = Variable(loss, requires_grad=True)
+        if _autograd_available:
+            # Return as Variable with gradient function
+            result = Variable(loss, requires_grad=True)
 
-        # Define backward function for proper gradient flow
-        def grad_fn(gradient):
-            if isinstance(predictions, Variable) and predictions.requires_grad:
-                batch_size = pred_data.shape[0]
+            # Define backward function for proper gradient flow
+            def grad_fn(gradient):
+                if isinstance(predictions, Variable) and predictions.requires_grad:
+                    batch_size = pred_data.shape[0]
 
-                # Gradient of cross-entropy with softmax
-                if len(target_data.shape) == 1 or target_data.shape[-1] == 1:
-                    # Integer labels - gradient is (softmax - one_hot_targets)
-                    grad = softmax_pred.copy()
-                    for i in range(batch_size):
-                        label = int(target_data[i])
-                        grad[i, label] -= 1
-                    grad = grad / batch_size * gradient  # Scale by incoming gradient
-                else:
-                    # One-hot labels
-                    grad = (softmax_pred - target_data) / batch_size * gradient
+                    # Gradient of cross-entropy with softmax
+                    if len(target_data.shape) == 1 or target_data.shape[-1] == 1:
+                        # Integer labels - gradient is (softmax - one_hot_targets)
+                        grad = softmax_pred.copy()
+                        for i in range(batch_size):
+                            label = int(target_data[i])
+                            grad[i, label] -= 1
+                        grad = grad / batch_size * gradient  # Scale by incoming gradient
+                    else:
+                        # One-hot labels
+                        grad = (softmax_pred - target_data) / batch_size * gradient
 
-                predictions.backward(grad)
+                    predictions.backward(grad)
 
-        result.grad_fn = grad_fn
-        return result
\ No newline at end of file
+            result.grad_fn = grad_fn
+            return result
+        else:
+            # Basic tensor operation - no gradient tracking yet
+            return Tensor(loss)
\ No newline at end of file