Fix module dependency ordering - no forward references

- Parameter class now works with basic Tensors initially and upgrades to Variables when autograd is available
- Loss functions work with basic tensor operations before the autograd module is introduced
- Each module can now be built and tested sequentially without needing future modules
- Modules 01-04 work with basic Tensors only
- Module 05 introduces autograd, after which earlier modules gain gradient capabilities
- Restored the proper pedagogical flow for incremental learning
2026-06-03 05:40:54 -05:00 · 2025-09-29 10:54:14 -04:00
parent 39e102626d
commit 3036ef74ef
2 changed files with 222 additions and 107 deletions
--- a/modules/03_layers/layers_dev.py
+++ b/modules/03_layers/layers_dev.py
@@ -77,75 +77,116 @@ else:
    finally:
        sys.path.pop(0)  # Always clean up path to avoid side effects

-# CRITICAL FIX: Parameter must be Variable-based for gradient tracking
 class Parameter:
    """
-    A trainable parameter that supports automatic differentiation.
+    A trainable parameter that wraps a Tensor and supports gradient tracking.

-    This creates a Variable with requires_grad=True for use as neural network parameters.
-    Essential for gradient-based optimization of weights and biases.
+    Initially works with basic Tensors only (modules 01-04).
+    After module 05 (autograd), gets enhanced with automatic differentiation.

-    IMPORTANT: Parameters must participate in autograd for training to work.
+    This staged approach allows students to build and test layers before learning autograd.
    """
    def __init__(self, data):
-        # Import Variable locally to avoid circular imports
+        if isinstance(data, Tensor):
+            self._tensor = data
+        else:
+            # Convert numpy array or list to Tensor
+            self._tensor = Tensor(data)
+
+        # Initially no gradient tracking - will be enhanced after autograd module
+        self._grad = None
+        self._requires_grad = True  # Mark as trainable for future enhancement
+
+        # Try to upgrade to Variable if autograd is available (after module 05)
+        self._try_upgrade_to_variable()
+
+    def _try_upgrade_to_variable(self):
+        """Attempt to upgrade to Variable if autograd is available."""
        try:
+            # Try importing Variable (will work after module 05)
            from tinytorch.core.autograd import Variable
+
+            # Upgrade to Variable for gradient tracking
+            self._variable = Variable(self._tensor.data, requires_grad=True)
+            self._is_variable = True
        except ImportError:
-            # For development, import from local module
-            import sys
-            import os
-            sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
-            from autograd_dev import Variable
-
-        # Create Variable with gradient tracking enabled
-        if isinstance(data, Variable):
-            self._variable = data
-            if not data.requires_grad:
-                # Ensure parameters always require gradients
-                self._variable.requires_grad = True
-        else:
-            # Convert data to Variable with gradient tracking
-            self._variable = Variable(data, requires_grad=True)
-
-    def __getattr__(self, name):
-        """Delegate all attribute access to the underlying Variable."""
-        return getattr(self._variable, name)
-
-    def __setattr__(self, name, value):
-        """Handle setting attributes."""
-        if name == '_variable':
-            super().__setattr__(name, value)
-        else:
-            # Delegate to underlying Variable
-            setattr(self._variable, name, value)
+            # Autograd not yet available - stay as basic Parameter with Tensor
+            self._variable = None
+            self._is_variable = False

    @property
    def data(self):
        """Access to underlying data."""
-        return self._variable.data
+        if self._is_variable:
+            return self._variable.data
+        else:
+            return self._tensor.data
+
+    @property
+    def shape(self):
+        """Shape of the parameter tensor."""
+        if self._is_variable:
+            return self._variable.data.shape
+        else:
+            return self._tensor.shape

    @property
    def grad(self):
-        """Access to gradient."""
-        return self._variable.grad
+        """Access to gradient (None if autograd not available yet)."""
+        if self._is_variable:
+            return self._variable.grad
+        else:
+            return self._grad  # Will be None initially

    @grad.setter
    def grad(self, value):
        """Set gradient."""
-        self._variable.grad = value
+        if self._is_variable:
+            self._variable.grad = value
+        else:
+            self._grad = value

    @property
    def requires_grad(self):
        """Whether this parameter requires gradients."""
-        return self._variable.requires_grad
+        if self._is_variable:
+            return self._variable.requires_grad
+        else:
+            return self._requires_grad

    def backward(self, gradient=None):
-        """Backpropagate gradients."""
-        return self._variable.backward(gradient)
+        """Backpropagate gradients (only works after autograd module)."""
+        if self._is_variable:
+            return self._variable.backward(gradient)
+        else:
+            raise NotImplementedError("Gradient computation requires autograd module (module 05)")
+
+    def __add__(self, other):
+        """Addition operation."""
+        if self._is_variable:
+            return self._variable + other
+        else:
+            return self._tensor + other
+
+    def __mul__(self, other):
+        """Multiplication operation."""
+        if self._is_variable:
+            return self._variable * other
+        else:
+            return self._tensor * other
+
+    def __matmul__(self, other):
+        """Matrix multiplication."""
+        if self._is_variable:
+            return self._variable @ other
+        else:
+            return self._tensor @ other

    def __repr__(self):
-        return f"Parameter({self._variable})"
+        if self._is_variable:
+            return f"Parameter({self._variable})"
+        else:
+            return f"Parameter(Tensor({self._tensor.data.shape}), requires_grad={self._requires_grad})"

 # In[ ]:

--- a/modules/04_losses/losses_dev.py
+++ b/modules/04_losses/losses_dev.py
@@ -63,18 +63,75 @@ import numpy as np
 import sys
 import os

-# Import our building blocks - try package first, then local modules
+# Import our building blocks - Tensor first, autograd operations if available
 try:
    from tinytorch.core.tensor import Tensor
-    from tinytorch.core.autograd import Variable, subtract, multiply, add, matmul
-    # CRITICAL: Now using full autograd integration for proper gradient flow
-    # These losses will work with the autograd computational graph
 except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    from tensor_dev import Tensor
-    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
-    from autograd_dev import Variable, subtract, multiply, add, matmul
+
+# Try to import autograd operations if available (after module 05)
+# Initially losses work with basic tensors, get enhanced with autograd later
+_autograd_available = False
+try:
+    from tinytorch.core.autograd import Variable, subtract, multiply, add, matmul
+    _autograd_available = True
+except ImportError:
+    # Try development import
+    try:
+        sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
+        from autograd_dev import Variable, subtract, multiply, add, matmul
+        _autograd_available = True
+    except ImportError:
+        # Autograd not available yet - losses will work with basic tensor operations
+        # This is the expected case for modules 01-04
+        _autograd_available = False
+
+        # Define basic operations for tensors (will be replaced by autograd versions later)
+        def subtract(a, b):
+            """Basic subtraction for tensors (before autograd)."""
+            if hasattr(a, 'data') and hasattr(b, 'data'):
+                return Tensor(a.data - b.data)
+            elif hasattr(a, 'data'):
+                return Tensor(a.data - b)
+            elif hasattr(b, 'data'):
+                return Tensor(a - b.data)
+            else:
+                return Tensor(a - b)
+
+        def multiply(a, b):
+            """Basic multiplication for tensors (before autograd)."""
+            if hasattr(a, 'data') and hasattr(b, 'data'):
+                return Tensor(a.data * b.data)
+            elif hasattr(a, 'data'):
+                return Tensor(a.data * b)
+            elif hasattr(b, 'data'):
+                return Tensor(a * b.data)
+            else:
+                return Tensor(a * b)
+
+        def add(a, b):
+            """Basic addition for tensors (before autograd)."""
+            if hasattr(a, 'data') and hasattr(b, 'data'):
+                return Tensor(a.data + b.data)
+            elif hasattr(a, 'data'):
+                return Tensor(a.data + b)
+            elif hasattr(b, 'data'):
+                return Tensor(a + b.data)
+            else:
+                return Tensor(a + b)
+
+        def matmul(a, b):
+            """Basic matrix multiplication for tensors (before autograd)."""
+            if hasattr(a, 'data') and hasattr(b, 'data'):
+                return Tensor(a.data @ b.data)
+            elif hasattr(a, 'data'):
+                return Tensor(a.data @ b)
+            elif hasattr(b, 'data'):
+                return Tensor(a @ b.data)
+            else:
+                return Tensor(a @ b)

 # %% nbgrader={"grade": false, "grade_id": "losses-setup", "locked": false, "schema_version": 3, "solution": false, "task": false}
 print("FIRE TinyTorch Loss Functions Module")
@@ -2208,11 +2265,11 @@ to enable proper backpropagation through the computational graph.
 #| export
 class MSELoss:
    """
-    Mean Squared Error Loss with Autograd Integration
+    Mean Squared Error Loss - Works with both Tensors and Variables

-    This version properly integrates with the autograd system to enable
-    gradient flow during backpropagation. Unlike the basic MeanSquaredError
-    above, this returns a Variable that participates in the computational graph.
+    Initially works with basic Tensors (modules 01-04).
+    Automatically upgrades to use Variables when autograd is available (module 05+).
+    This staged approach allows testing loss functions before learning automatic differentiation.
    """

    def __init__(self):
@@ -2221,44 +2278,55 @@ class MSELoss:

    def __call__(self, predictions, targets):
        """
-        Compute MSE loss with autograd support.
+        Compute MSE loss.

        Args:
-            predictions: Model predictions (Variable or convertible to Variable)
-            targets: True targets (Variable or convertible to Variable)
+            predictions: Model predictions (Tensor/Variable)
+            targets: True targets (Tensor/Variable)

        Returns:
-            Variable with scalar loss value and gradient tracking
+            Scalar loss value (Tensor initially, Variable after autograd)
        """
-        # Ensure inputs are Variables for gradient tracking
-        if not isinstance(predictions, Variable):
+        if _autograd_available:
+            # Autograd available - use Variables for gradient tracking
+            if not isinstance(predictions, Variable):
+                pred_data = predictions.data if hasattr(predictions, 'data') else predictions
+                predictions = Variable(pred_data, requires_grad=False)
+
+            if not isinstance(targets, Variable):
+                target_data = targets.data if hasattr(targets, 'data') else targets
+                targets = Variable(target_data, requires_grad=False)
+
+            # Compute MSE using autograd operations
+            diff = subtract(predictions, targets)
+            squared_diff = multiply(diff, diff)
+
+            # Sum all elements and divide by count to get mean
+            loss = Variable.sum(squared_diff)
+
+            # Convert to mean (divide by number of elements)
+            batch_size = predictions.data.data.size
+            mean_loss = multiply(loss, 1.0 / batch_size)
+        else:
+            # Basic tensor operations - no gradient tracking yet
            pred_data = predictions.data if hasattr(predictions, 'data') else predictions
-            predictions = Variable(pred_data, requires_grad=False)
-
-        if not isinstance(targets, Variable):
            target_data = targets.data if hasattr(targets, 'data') else targets
-            targets = Variable(target_data, requires_grad=False)

-        # Compute MSE using autograd operations
-        diff = subtract(predictions, targets)
-        squared_diff = multiply(diff, diff)
-
-        # Sum all elements and divide by count to get mean
-        loss = Variable.sum(squared_diff)
-
-        # Convert to mean (divide by number of elements)
-        batch_size = predictions.data.data.size
-        mean_loss = multiply(loss, 1.0 / batch_size)
+            # Compute MSE using numpy operations
+            diff = pred_data - target_data
+            squared_diff = diff * diff
+            mean_loss = Tensor(np.mean(squared_diff))

        return mean_loss

 #| export
 class CrossEntropyLoss:
    """
-    Cross-Entropy Loss with Autograd Integration
+    Cross-Entropy Loss - Works with both Tensors and Variables

-    Simplified cross-entropy that works with the autograd system.
-    For training neural networks with gradient-based optimization.
+    Initially works with basic Tensors (modules 01-04).
+    Automatically upgrades to use Variables when autograd is available (module 05+).
+    This staged approach allows testing loss functions before learning automatic differentiation.
    """

    def __init__(self):
@@ -2267,27 +2335,29 @@ class CrossEntropyLoss:

    def __call__(self, predictions, targets):
        """
-        Compute cross-entropy loss with autograd support.
+        Compute cross-entropy loss.

        Args:
-            predictions: Model predictions/logits (Variable)
-            targets: True class indices (Variable or numpy array)
+            predictions: Model predictions/logits (Tensor/Variable)
+            targets: True class indices (Tensor/Variable or numpy array)

        Returns:
-            Variable with scalar loss value and gradient tracking
+            Scalar loss value (Tensor initially, Variable after autograd)
        """
-        # Handle Variable inputs
-        if isinstance(predictions, Variable):
-            pred_data = predictions.data.data
-        elif hasattr(predictions, 'data'):
-            pred_data = predictions.data
+        # Extract raw data from inputs
+        if hasattr(predictions, 'data'):
+            if hasattr(predictions.data, 'data'):  # Variable with nested data
+                pred_data = predictions.data.data
+            else:  # Tensor with data
+                pred_data = predictions.data
        else:
            pred_data = predictions

-        if isinstance(targets, Variable):
-            target_data = targets.data.data
-        elif hasattr(targets, 'data'):
-            target_data = targets.data
+        if hasattr(targets, 'data'):
+            if hasattr(targets.data, 'data'):  # Variable with nested data
+                target_data = targets.data.data
+            else:  # Tensor with data
+                target_data = targets.data
        else:
            target_data = targets

@@ -2311,27 +2381,31 @@ class CrossEntropyLoss:
            # One-hot labels
            loss = -np.mean(np.sum(target_data * np.log(softmax_pred), axis=-1))

-        # Return as Variable with gradient function
-        result = Variable(loss, requires_grad=True)
+        if _autograd_available:
+            # Return as Variable with gradient function
+            result = Variable(loss, requires_grad=True)

-        # Define backward function for proper gradient flow
-        def grad_fn(gradient):
-            if isinstance(predictions, Variable) and predictions.requires_grad:
-                batch_size = pred_data.shape[0]
+            # Define backward function for proper gradient flow
+            def grad_fn(gradient):
+                if isinstance(predictions, Variable) and predictions.requires_grad:
+                    batch_size = pred_data.shape[0]

-                # Gradient of cross-entropy with softmax
-                if len(target_data.shape) == 1 or target_data.shape[-1] == 1:
-                    # Integer labels - gradient is (softmax - one_hot_targets)
-                    grad = softmax_pred.copy()
-                    for i in range(batch_size):
-                        label = int(target_data[i])
-                        grad[i, label] -= 1
-                    grad = grad / batch_size * gradient  # Scale by incoming gradient
-                else:
-                    # One-hot labels
-                    grad = (softmax_pred - target_data) / batch_size * gradient
+                    # Gradient of cross-entropy with softmax
+                    if len(target_data.shape) == 1 or target_data.shape[-1] == 1:
+                        # Integer labels - gradient is (softmax - one_hot_targets)
+                        grad = softmax_pred.copy()
+                        for i in range(batch_size):
+                            label = int(target_data[i])
+                            grad[i, label] -= 1
+                        grad = grad / batch_size * gradient  # Scale by incoming gradient
+                    else:
+                        # One-hot labels
+                        grad = (softmax_pred - target_data) / batch_size * gradient

-                predictions.backward(grad)
+                    predictions.backward(grad)

-        result.grad_fn = grad_fn
-        return result
+            result.grad_fn = grad_fn
+            return result
+        else:
+            # Basic tensor operation - no gradient tracking yet
+            return Tensor(loss)