From 3036ef74ef6533bb7a45a583a69fca68a7bf81a1 Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Mon, 29 Sep 2025 10:54:14 -0400 Subject: [PATCH] Fix module dependency ordering - no forward references - Parameter class now works with basic Tensors initially, upgrades to Variables when autograd available - Loss functions work with basic tensor operations before autograd module - Each module can now be built and tested sequentially without needing future modules - Modules 01-04 work with basic Tensors only - Module 05 introduces autograd, then earlier modules get gradient capabilities - Restored proper pedagogical flow for incremental learning --- modules/03_layers/layers_dev.py | 123 ++++++++++++------- modules/04_losses/losses_dev.py | 206 ++++++++++++++++++++++---------- 2 files changed, 222 insertions(+), 107 deletions(-) diff --git a/modules/03_layers/layers_dev.py b/modules/03_layers/layers_dev.py index 9f417e8a..795fcfab 100644 --- a/modules/03_layers/layers_dev.py +++ b/modules/03_layers/layers_dev.py @@ -77,75 +77,116 @@ else: finally: sys.path.pop(0) # Always clean up path to avoid side effects -# CRITICAL FIX: Parameter must be Variable-based for gradient tracking class Parameter: """ - A trainable parameter that supports automatic differentiation. + A trainable parameter that wraps a Tensor and supports gradient tracking. - This creates a Variable with requires_grad=True for use as neural network parameters. - Essential for gradient-based optimization of weights and biases. + Initially works with basic Tensors only (modules 01-04). + After module 05 (autograd), gets enhanced with automatic differentiation. - IMPORTANT: Parameters must participate in autograd for training to work. + This staged approach allows students to build and test layers before learning autograd. """ def __init__(self, data): - # Import Variable locally to avoid circular imports + if isinstance(data, Tensor): + self._tensor = data + else: + # Convert numpy array or list to Tensor + self._tensor = Tensor(data) + + # Initially no gradient tracking - will be enhanced after autograd module + self._grad = None + self._requires_grad = True # Mark as trainable for future enhancement + + # Try to upgrade to Variable if autograd is available (after module 05) + self._try_upgrade_to_variable() + + def _try_upgrade_to_variable(self): + """Attempt to upgrade to Variable if autograd is available.""" try: + # Try importing Variable (will work after module 05) from tinytorch.core.autograd import Variable + + # Upgrade to Variable for gradient tracking + self._variable = Variable(self._tensor.data, requires_grad=True) + self._is_variable = True except ImportError: - # For development, import from local module - import sys - import os - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd')) - from autograd_dev import Variable - - # Create Variable with gradient tracking enabled - if isinstance(data, Variable): - self._variable = data - if not data.requires_grad: - # Ensure parameters always require gradients - self._variable.requires_grad = True - else: - # Convert data to Variable with gradient tracking - self._variable = Variable(data, requires_grad=True) - - def __getattr__(self, name): - """Delegate all attribute access to the underlying Variable.""" - return getattr(self._variable, name) - - def __setattr__(self, name, value): - """Handle setting attributes.""" - if name == '_variable': - super().__setattr__(name, value) - else: - # Delegate to underlying Variable - setattr(self._variable, name, value) + # Autograd not yet available - stay as basic Parameter with Tensor + self._variable = None + self._is_variable = False @property def data(self): """Access to underlying data.""" - return self._variable.data + if self._is_variable: + return self._variable.data + else: + return self._tensor.data + + @property + def shape(self): + """Shape of the parameter tensor.""" + if self._is_variable: + return self._variable.data.shape + else: + return self._tensor.shape @property def grad(self): - """Access to gradient.""" - return self._variable.grad + """Access to gradient (None if autograd not available yet).""" + if self._is_variable: + return self._variable.grad + else: + return self._grad # Will be None initially @grad.setter def grad(self, value): """Set gradient.""" - self._variable.grad = value + if self._is_variable: + self._variable.grad = value + else: + self._grad = value @property def requires_grad(self): """Whether this parameter requires gradients.""" - return self._variable.requires_grad + if self._is_variable: + return self._variable.requires_grad + else: + return self._requires_grad def backward(self, gradient=None): - """Backpropagate gradients.""" - return self._variable.backward(gradient) + """Backpropagate gradients (only works after autograd module).""" + if self._is_variable: + return self._variable.backward(gradient) + else: + raise NotImplementedError("Gradient computation requires autograd module (module 05)") + + def __add__(self, other): + """Addition operation.""" + if self._is_variable: + return self._variable + other + else: + return self._tensor + other + + def __mul__(self, other): + """Multiplication operation.""" + if self._is_variable: + return self._variable * other + else: + return self._tensor * other + + def __matmul__(self, other): + """Matrix multiplication.""" + if self._is_variable: + return self._variable @ other + else: + return self._tensor @ other def __repr__(self): - return f"Parameter({self._variable})" + if self._is_variable: + return f"Parameter({self._variable})" + else: + return f"Parameter(Tensor({self._tensor.data.shape}), requires_grad={self._requires_grad})" # In[ ]: diff --git a/modules/04_losses/losses_dev.py b/modules/04_losses/losses_dev.py index 5c021137..2d1dd4d1 100644 --- a/modules/04_losses/losses_dev.py +++ b/modules/04_losses/losses_dev.py @@ -63,18 +63,75 @@ import numpy as np import sys import os -# Import our building blocks - try package first, then local modules +# Import our building blocks - Tensor first, autograd operations if available try: from tinytorch.core.tensor import Tensor - from tinytorch.core.autograd import Variable, subtract, multiply, add, matmul - # CRITICAL: Now using full autograd integration for proper gradient flow - # These losses will work with the autograd computational graph except ImportError: # For development, import from local modules sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor')) from tensor_dev import Tensor - sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd')) - from autograd_dev import Variable, subtract, multiply, add, matmul + +# Try to import autograd operations if available (after module 05) +# Initially losses work with basic tensors, get enhanced with autograd later +_autograd_available = False +try: + from tinytorch.core.autograd import Variable, subtract, multiply, add, matmul + _autograd_available = True +except ImportError: + # Try development import + try: + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd')) + from autograd_dev import Variable, subtract, multiply, add, matmul + _autograd_available = True + except ImportError: + # Autograd not available yet - losses will work with basic tensor operations + # This is the expected case for modules 01-04 + _autograd_available = False + + # Define basic operations for tensors (will be replaced by autograd versions later) + def subtract(a, b): + """Basic subtraction for tensors (before autograd).""" + if hasattr(a, 'data') and hasattr(b, 'data'): + return Tensor(a.data - b.data) + elif hasattr(a, 'data'): + return Tensor(a.data - b) + elif hasattr(b, 'data'): + return Tensor(a - b.data) + else: + return Tensor(a - b) + + def multiply(a, b): + """Basic multiplication for tensors (before autograd).""" + if hasattr(a, 'data') and hasattr(b, 'data'): + return Tensor(a.data * b.data) + elif hasattr(a, 'data'): + return Tensor(a.data * b) + elif hasattr(b, 'data'): + return Tensor(a * b.data) + else: + return Tensor(a * b) + + def add(a, b): + """Basic addition for tensors (before autograd).""" + if hasattr(a, 'data') and hasattr(b, 'data'): + return Tensor(a.data + b.data) + elif hasattr(a, 'data'): + return Tensor(a.data + b) + elif hasattr(b, 'data'): + return Tensor(a + b.data) + else: + return Tensor(a + b) + + def matmul(a, b): + """Basic matrix multiplication for tensors (before autograd).""" + if hasattr(a, 'data') and hasattr(b, 'data'): + return Tensor(a.data @ b.data) + elif hasattr(a, 'data'): + return Tensor(a.data @ b) + elif hasattr(b, 'data'): + return Tensor(a @ b.data) + else: + return Tensor(a @ b) # %% nbgrader={"grade": false, "grade_id": "losses-setup", "locked": false, "schema_version": 3, "solution": false, "task": false} print("FIRE TinyTorch Loss Functions Module") @@ -2208,11 +2265,11 @@ to enable proper backpropagation through the computational graph. #| export class MSELoss: """ - Mean Squared Error Loss with Autograd Integration + Mean Squared Error Loss - Works with both Tensors and Variables - This version properly integrates with the autograd system to enable - gradient flow during backpropagation. Unlike the basic MeanSquaredError - above, this returns a Variable that participates in the computational graph. + Initially works with basic Tensors (modules 01-04). + Automatically upgrades to use Variables when autograd is available (module 05+). + This staged approach allows testing loss functions before learning automatic differentiation. """ def __init__(self): @@ -2221,44 +2278,55 @@ class MSELoss: def __call__(self, predictions, targets): """ - Compute MSE loss with autograd support. + Compute MSE loss. Args: - predictions: Model predictions (Variable or convertible to Variable) - targets: True targets (Variable or convertible to Variable) + predictions: Model predictions (Tensor/Variable) + targets: True targets (Tensor/Variable) Returns: - Variable with scalar loss value and gradient tracking + Scalar loss value (Tensor initially, Variable after autograd) """ - # Ensure inputs are Variables for gradient tracking - if not isinstance(predictions, Variable): + if _autograd_available: + # Autograd available - use Variables for gradient tracking + if not isinstance(predictions, Variable): + pred_data = predictions.data if hasattr(predictions, 'data') else predictions + predictions = Variable(pred_data, requires_grad=False) + + if not isinstance(targets, Variable): + target_data = targets.data if hasattr(targets, 'data') else targets + targets = Variable(target_data, requires_grad=False) + + # Compute MSE using autograd operations + diff = subtract(predictions, targets) + squared_diff = multiply(diff, diff) + + # Sum all elements and divide by count to get mean + loss = Variable.sum(squared_diff) + + # Convert to mean (divide by number of elements) + batch_size = predictions.data.data.size + mean_loss = multiply(loss, 1.0 / batch_size) + else: + # Basic tensor operations - no gradient tracking yet pred_data = predictions.data if hasattr(predictions, 'data') else predictions - predictions = Variable(pred_data, requires_grad=False) - - if not isinstance(targets, Variable): target_data = targets.data if hasattr(targets, 'data') else targets - targets = Variable(target_data, requires_grad=False) - # Compute MSE using autograd operations - diff = subtract(predictions, targets) - squared_diff = multiply(diff, diff) - - # Sum all elements and divide by count to get mean - loss = Variable.sum(squared_diff) - - # Convert to mean (divide by number of elements) - batch_size = predictions.data.data.size - mean_loss = multiply(loss, 1.0 / batch_size) + # Compute MSE using numpy operations + diff = pred_data - target_data + squared_diff = diff * diff + mean_loss = Tensor(np.mean(squared_diff)) return mean_loss #| export class CrossEntropyLoss: """ - Cross-Entropy Loss with Autograd Integration + Cross-Entropy Loss - Works with both Tensors and Variables - Simplified cross-entropy that works with the autograd system. - For training neural networks with gradient-based optimization. + Initially works with basic Tensors (modules 01-04). + Automatically upgrades to use Variables when autograd is available (module 05+). + This staged approach allows testing loss functions before learning automatic differentiation. """ def __init__(self): @@ -2267,27 +2335,29 @@ class CrossEntropyLoss: def __call__(self, predictions, targets): """ - Compute cross-entropy loss with autograd support. + Compute cross-entropy loss. Args: - predictions: Model predictions/logits (Variable) - targets: True class indices (Variable or numpy array) + predictions: Model predictions/logits (Tensor/Variable) + targets: True class indices (Tensor/Variable or numpy array) Returns: - Variable with scalar loss value and gradient tracking + Scalar loss value (Tensor initially, Variable after autograd) """ - # Handle Variable inputs - if isinstance(predictions, Variable): - pred_data = predictions.data.data - elif hasattr(predictions, 'data'): - pred_data = predictions.data + # Extract raw data from inputs + if hasattr(predictions, 'data'): + if hasattr(predictions.data, 'data'): # Variable with nested data + pred_data = predictions.data.data + else: # Tensor with data + pred_data = predictions.data else: pred_data = predictions - if isinstance(targets, Variable): - target_data = targets.data.data - elif hasattr(targets, 'data'): - target_data = targets.data + if hasattr(targets, 'data'): + if hasattr(targets.data, 'data'): # Variable with nested data + target_data = targets.data.data + else: # Tensor with data + target_data = targets.data else: target_data = targets @@ -2311,27 +2381,31 @@ class CrossEntropyLoss: # One-hot labels loss = -np.mean(np.sum(target_data * np.log(softmax_pred), axis=-1)) - # Return as Variable with gradient function - result = Variable(loss, requires_grad=True) + if _autograd_available: + # Return as Variable with gradient function + result = Variable(loss, requires_grad=True) - # Define backward function for proper gradient flow - def grad_fn(gradient): - if isinstance(predictions, Variable) and predictions.requires_grad: - batch_size = pred_data.shape[0] + # Define backward function for proper gradient flow + def grad_fn(gradient): + if isinstance(predictions, Variable) and predictions.requires_grad: + batch_size = pred_data.shape[0] - # Gradient of cross-entropy with softmax - if len(target_data.shape) == 1 or target_data.shape[-1] == 1: - # Integer labels - gradient is (softmax - one_hot_targets) - grad = softmax_pred.copy() - for i in range(batch_size): - label = int(target_data[i]) - grad[i, label] -= 1 - grad = grad / batch_size * gradient # Scale by incoming gradient - else: - # One-hot labels - grad = (softmax_pred - target_data) / batch_size * gradient + # Gradient of cross-entropy with softmax + if len(target_data.shape) == 1 or target_data.shape[-1] == 1: + # Integer labels - gradient is (softmax - one_hot_targets) + grad = softmax_pred.copy() + for i in range(batch_size): + label = int(target_data[i]) + grad[i, label] -= 1 + grad = grad / batch_size * gradient # Scale by incoming gradient + else: + # One-hot labels + grad = (softmax_pred - target_data) / batch_size * gradient - predictions.backward(grad) + predictions.backward(grad) - result.grad_fn = grad_fn - return result \ No newline at end of file + result.grad_fn = grad_fn + return result + else: + # Basic tensor operation - no gradient tracking yet + return Tensor(loss) \ No newline at end of file