mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-03 05:40:54 -05:00
Fix module dependency ordering - no forward references
- Parameter class now works with basic Tensors initially and upgrades to Variables when autograd is available
- Loss functions work with basic tensor operations before the autograd module
- Each module can now be built and tested sequentially without needing future modules
- Modules 01-04 work with basic Tensors only
- Module 05 introduces autograd, then earlier modules get gradient capabilities
- Restored proper pedagogical flow for incremental learning
This commit is contained in:
@@ -77,75 +77,116 @@ else:
|
||||
finally:
|
||||
sys.path.pop(0) # Always clean up path to avoid side effects
|
||||
|
||||
# CRITICAL FIX: Parameter must be Variable-based for gradient tracking
|
||||
class Parameter:
|
||||
"""
|
||||
A trainable parameter that supports automatic differentiation.
|
||||
A trainable parameter that wraps a Tensor and supports gradient tracking.
|
||||
|
||||
This creates a Variable with requires_grad=True for use as neural network parameters.
|
||||
Essential for gradient-based optimization of weights and biases.
|
||||
Initially works with basic Tensors only (modules 01-04).
|
||||
After module 05 (autograd), gets enhanced with automatic differentiation.
|
||||
|
||||
IMPORTANT: Parameters must participate in autograd for training to work.
|
||||
This staged approach allows students to build and test layers before learning autograd.
|
||||
"""
|
||||
def __init__(self, data):
|
||||
# Import Variable locally to avoid circular imports
|
||||
if isinstance(data, Tensor):
|
||||
self._tensor = data
|
||||
else:
|
||||
# Convert numpy array or list to Tensor
|
||||
self._tensor = Tensor(data)
|
||||
|
||||
# Initially no gradient tracking - will be enhanced after autograd module
|
||||
self._grad = None
|
||||
self._requires_grad = True # Mark as trainable for future enhancement
|
||||
|
||||
# Try to upgrade to Variable if autograd is available (after module 05)
|
||||
self._try_upgrade_to_variable()
|
||||
|
||||
def _try_upgrade_to_variable(self):
|
||||
"""Attempt to upgrade to Variable if autograd is available."""
|
||||
try:
|
||||
# Try importing Variable (will work after module 05)
|
||||
from tinytorch.core.autograd import Variable
|
||||
|
||||
# Upgrade to Variable for gradient tracking
|
||||
self._variable = Variable(self._tensor.data, requires_grad=True)
|
||||
self._is_variable = True
|
||||
except ImportError:
|
||||
# For development, import from local module
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
|
||||
from autograd_dev import Variable
|
||||
|
||||
# Create Variable with gradient tracking enabled
|
||||
if isinstance(data, Variable):
|
||||
self._variable = data
|
||||
if not data.requires_grad:
|
||||
# Ensure parameters always require gradients
|
||||
self._variable.requires_grad = True
|
||||
else:
|
||||
# Convert data to Variable with gradient tracking
|
||||
self._variable = Variable(data, requires_grad=True)
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Delegate all attribute access to the underlying Variable."""
|
||||
return getattr(self._variable, name)
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
"""Handle setting attributes."""
|
||||
if name == '_variable':
|
||||
super().__setattr__(name, value)
|
||||
else:
|
||||
# Delegate to underlying Variable
|
||||
setattr(self._variable, name, value)
|
||||
# Autograd not yet available - stay as basic Parameter with Tensor
|
||||
self._variable = None
|
||||
self._is_variable = False
|
||||
|
||||
@property
|
||||
def data(self):
|
||||
"""Access to underlying data."""
|
||||
return self._variable.data
|
||||
if self._is_variable:
|
||||
return self._variable.data
|
||||
else:
|
||||
return self._tensor.data
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
"""Shape of the parameter tensor."""
|
||||
if self._is_variable:
|
||||
return self._variable.data.shape
|
||||
else:
|
||||
return self._tensor.shape
|
||||
|
||||
@property
|
||||
def grad(self):
|
||||
"""Access to gradient."""
|
||||
return self._variable.grad
|
||||
"""Access to gradient (None if autograd not available yet)."""
|
||||
if self._is_variable:
|
||||
return self._variable.grad
|
||||
else:
|
||||
return self._grad # Will be None initially
|
||||
|
||||
@grad.setter
|
||||
def grad(self, value):
|
||||
"""Set gradient."""
|
||||
self._variable.grad = value
|
||||
if self._is_variable:
|
||||
self._variable.grad = value
|
||||
else:
|
||||
self._grad = value
|
||||
|
||||
@property
|
||||
def requires_grad(self):
|
||||
"""Whether this parameter requires gradients."""
|
||||
return self._variable.requires_grad
|
||||
if self._is_variable:
|
||||
return self._variable.requires_grad
|
||||
else:
|
||||
return self._requires_grad
|
||||
|
||||
def backward(self, gradient=None):
|
||||
"""Backpropagate gradients."""
|
||||
return self._variable.backward(gradient)
|
||||
"""Backpropagate gradients (only works after autograd module)."""
|
||||
if self._is_variable:
|
||||
return self._variable.backward(gradient)
|
||||
else:
|
||||
raise NotImplementedError("Gradient computation requires autograd module (module 05)")
|
||||
|
||||
def __add__(self, other):
|
||||
"""Addition operation."""
|
||||
if self._is_variable:
|
||||
return self._variable + other
|
||||
else:
|
||||
return self._tensor + other
|
||||
|
||||
def __mul__(self, other):
|
||||
"""Multiplication operation."""
|
||||
if self._is_variable:
|
||||
return self._variable * other
|
||||
else:
|
||||
return self._tensor * other
|
||||
|
||||
def __matmul__(self, other):
|
||||
"""Matrix multiplication."""
|
||||
if self._is_variable:
|
||||
return self._variable @ other
|
||||
else:
|
||||
return self._tensor @ other
|
||||
|
||||
def __repr__(self):
|
||||
return f"Parameter({self._variable})"
|
||||
if self._is_variable:
|
||||
return f"Parameter({self._variable})"
|
||||
else:
|
||||
return f"Parameter(Tensor({self._tensor.data.shape}), requires_grad={self._requires_grad})"
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
@@ -63,18 +63,75 @@ import numpy as np
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Import our building blocks - try package first, then local modules
|
||||
# Import our building blocks - Tensor first, autograd operations if available
|
||||
try:
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import Variable, subtract, multiply, add, matmul
|
||||
# CRITICAL: Now using full autograd integration for proper gradient flow
|
||||
# These losses will work with the autograd computational graph
|
||||
except ImportError:
|
||||
# For development, import from local modules
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
|
||||
from tensor_dev import Tensor
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
|
||||
from autograd_dev import Variable, subtract, multiply, add, matmul
|
||||
|
||||
# Try to import autograd operations if available (after module 05)
|
||||
# Initially losses work with basic tensors, get enhanced with autograd later
|
||||
_autograd_available = False
|
||||
try:
|
||||
from tinytorch.core.autograd import Variable, subtract, multiply, add, matmul
|
||||
_autograd_available = True
|
||||
except ImportError:
|
||||
# Try development import
|
||||
try:
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '05_autograd'))
|
||||
from autograd_dev import Variable, subtract, multiply, add, matmul
|
||||
_autograd_available = True
|
||||
except ImportError:
|
||||
# Autograd not available yet - losses will work with basic tensor operations
|
||||
# This is the expected case for modules 01-04
|
||||
_autograd_available = False
|
||||
|
||||
# Define basic operations for tensors (will be replaced by autograd versions later)
|
||||
def subtract(a, b):
    """Basic subtraction for tensors (before autograd).

    Each operand may be an object exposing a ``.data`` attribute (e.g. a
    Tensor), a raw NumPy array/scalar, or a plain Python number.

    Args:
        a: Left operand (minuend).
        b: Right operand (subtrahend).

    Returns:
        A new ``Tensor`` wrapping ``a - b`` computed on the unwrapped values.
    """
    # NumPy arrays and scalars expose their own `.data` attribute (a raw
    # buffer object), so a bare hasattr(x, 'data') probe would "unwrap" them
    # into something that does not support arithmetic. Use them as-is.
    numpy_types = (np.ndarray, np.generic)
    lhs = a if isinstance(a, numpy_types) else getattr(a, 'data', a)
    rhs = b if isinstance(b, numpy_types) else getattr(b, 'data', b)
    return Tensor(lhs - rhs)
|
||||
|
||||
def multiply(a, b):
    """Basic element-wise multiplication for tensors (before autograd).

    Each operand may be an object exposing a ``.data`` attribute (e.g. a
    Tensor), a raw NumPy array/scalar, or a plain Python number.

    Args:
        a: Left operand.
        b: Right operand.

    Returns:
        A new ``Tensor`` wrapping ``a * b`` computed on the unwrapped values.
    """
    # NumPy arrays and scalars expose their own `.data` attribute (a raw
    # buffer object), so a bare hasattr(x, 'data') probe would "unwrap" them
    # into something that does not support arithmetic. Use them as-is.
    numpy_types = (np.ndarray, np.generic)
    lhs = a if isinstance(a, numpy_types) else getattr(a, 'data', a)
    rhs = b if isinstance(b, numpy_types) else getattr(b, 'data', b)
    return Tensor(lhs * rhs)
|
||||
|
||||
def add(a, b):
    """Basic addition for tensors (before autograd).

    Each operand may be an object exposing a ``.data`` attribute (e.g. a
    Tensor), a raw NumPy array/scalar, or a plain Python number.

    Args:
        a: Left operand.
        b: Right operand.

    Returns:
        A new ``Tensor`` wrapping ``a + b`` computed on the unwrapped values.
    """
    # NumPy arrays and scalars expose their own `.data` attribute (a raw
    # buffer object), so a bare hasattr(x, 'data') probe would "unwrap" them
    # into something that does not support arithmetic. Use them as-is.
    numpy_types = (np.ndarray, np.generic)
    lhs = a if isinstance(a, numpy_types) else getattr(a, 'data', a)
    rhs = b if isinstance(b, numpy_types) else getattr(b, 'data', b)
    return Tensor(lhs + rhs)
|
||||
|
||||
def matmul(a, b):
    """Basic matrix multiplication for tensors (before autograd).

    Each operand may be an object exposing a ``.data`` attribute (e.g. a
    Tensor) or a raw NumPy array. The unwrapped values must support the
    ``@`` operator.

    Args:
        a: Left operand.
        b: Right operand.

    Returns:
        A new ``Tensor`` wrapping ``a @ b`` computed on the unwrapped values.
    """
    # NumPy arrays and scalars expose their own `.data` attribute (a raw
    # buffer object), so a bare hasattr(x, 'data') probe would "unwrap" them
    # into something that does not support `@`. Use them as-is.
    numpy_types = (np.ndarray, np.generic)
    lhs = a if isinstance(a, numpy_types) else getattr(a, 'data', a)
    rhs = b if isinstance(b, numpy_types) else getattr(b, 'data', b)
    return Tensor(lhs @ rhs)
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "losses-setup", "locked": false, "schema_version": 3, "solution": false, "task": false}
|
||||
print("FIRE TinyTorch Loss Functions Module")
|
||||
@@ -2208,11 +2265,11 @@ to enable proper backpropagation through the computational graph.
|
||||
#| export
|
||||
class MSELoss:
|
||||
"""
|
||||
Mean Squared Error Loss with Autograd Integration
|
||||
Mean Squared Error Loss - Works with both Tensors and Variables
|
||||
|
||||
This version properly integrates with the autograd system to enable
|
||||
gradient flow during backpropagation. Unlike the basic MeanSquaredError
|
||||
above, this returns a Variable that participates in the computational graph.
|
||||
Initially works with basic Tensors (modules 01-04).
|
||||
Automatically upgrades to use Variables when autograd is available (module 05+).
|
||||
This staged approach allows testing loss functions before learning automatic differentiation.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -2221,44 +2278,55 @@ class MSELoss:
|
||||
|
||||
def __call__(self, predictions, targets):
|
||||
"""
|
||||
Compute MSE loss with autograd support.
|
||||
Compute MSE loss.
|
||||
|
||||
Args:
|
||||
predictions: Model predictions (Variable or convertible to Variable)
|
||||
targets: True targets (Variable or convertible to Variable)
|
||||
predictions: Model predictions (Tensor/Variable)
|
||||
targets: True targets (Tensor/Variable)
|
||||
|
||||
Returns:
|
||||
Variable with scalar loss value and gradient tracking
|
||||
Scalar loss value (Tensor initially, Variable after autograd)
|
||||
"""
|
||||
# Ensure inputs are Variables for gradient tracking
|
||||
if not isinstance(predictions, Variable):
|
||||
if _autograd_available:
|
||||
# Autograd available - use Variables for gradient tracking
|
||||
if not isinstance(predictions, Variable):
|
||||
pred_data = predictions.data if hasattr(predictions, 'data') else predictions
|
||||
predictions = Variable(pred_data, requires_grad=False)
|
||||
|
||||
if not isinstance(targets, Variable):
|
||||
target_data = targets.data if hasattr(targets, 'data') else targets
|
||||
targets = Variable(target_data, requires_grad=False)
|
||||
|
||||
# Compute MSE using autograd operations
|
||||
diff = subtract(predictions, targets)
|
||||
squared_diff = multiply(diff, diff)
|
||||
|
||||
# Sum all elements and divide by count to get mean
|
||||
loss = Variable.sum(squared_diff)
|
||||
|
||||
# Convert to mean (divide by number of elements)
|
||||
batch_size = predictions.data.data.size
|
||||
mean_loss = multiply(loss, 1.0 / batch_size)
|
||||
else:
|
||||
# Basic tensor operations - no gradient tracking yet
|
||||
pred_data = predictions.data if hasattr(predictions, 'data') else predictions
|
||||
predictions = Variable(pred_data, requires_grad=False)
|
||||
|
||||
if not isinstance(targets, Variable):
|
||||
target_data = targets.data if hasattr(targets, 'data') else targets
|
||||
targets = Variable(target_data, requires_grad=False)
|
||||
|
||||
# Compute MSE using autograd operations
|
||||
diff = subtract(predictions, targets)
|
||||
squared_diff = multiply(diff, diff)
|
||||
|
||||
# Sum all elements and divide by count to get mean
|
||||
loss = Variable.sum(squared_diff)
|
||||
|
||||
# Convert to mean (divide by number of elements)
|
||||
batch_size = predictions.data.data.size
|
||||
mean_loss = multiply(loss, 1.0 / batch_size)
|
||||
# Compute MSE using numpy operations
|
||||
diff = pred_data - target_data
|
||||
squared_diff = diff * diff
|
||||
mean_loss = Tensor(np.mean(squared_diff))
|
||||
|
||||
return mean_loss
|
||||
|
||||
#| export
|
||||
class CrossEntropyLoss:
|
||||
"""
|
||||
Cross-Entropy Loss with Autograd Integration
|
||||
Cross-Entropy Loss - Works with both Tensors and Variables
|
||||
|
||||
Simplified cross-entropy that works with the autograd system.
|
||||
For training neural networks with gradient-based optimization.
|
||||
Initially works with basic Tensors (modules 01-04).
|
||||
Automatically upgrades to use Variables when autograd is available (module 05+).
|
||||
This staged approach allows testing loss functions before learning automatic differentiation.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -2267,27 +2335,29 @@ class CrossEntropyLoss:
|
||||
|
||||
def __call__(self, predictions, targets):
|
||||
"""
|
||||
Compute cross-entropy loss with autograd support.
|
||||
Compute cross-entropy loss.
|
||||
|
||||
Args:
|
||||
predictions: Model predictions/logits (Variable)
|
||||
targets: True class indices (Variable or numpy array)
|
||||
predictions: Model predictions/logits (Tensor/Variable)
|
||||
targets: True class indices (Tensor/Variable or numpy array)
|
||||
|
||||
Returns:
|
||||
Variable with scalar loss value and gradient tracking
|
||||
Scalar loss value (Tensor initially, Variable after autograd)
|
||||
"""
|
||||
# Handle Variable inputs
|
||||
if isinstance(predictions, Variable):
|
||||
pred_data = predictions.data.data
|
||||
elif hasattr(predictions, 'data'):
|
||||
pred_data = predictions.data
|
||||
# Extract raw data from inputs
|
||||
if hasattr(predictions, 'data'):
|
||||
if hasattr(predictions.data, 'data'): # Variable with nested data
|
||||
pred_data = predictions.data.data
|
||||
else: # Tensor with data
|
||||
pred_data = predictions.data
|
||||
else:
|
||||
pred_data = predictions
|
||||
|
||||
if isinstance(targets, Variable):
|
||||
target_data = targets.data.data
|
||||
elif hasattr(targets, 'data'):
|
||||
target_data = targets.data
|
||||
if hasattr(targets, 'data'):
|
||||
if hasattr(targets.data, 'data'): # Variable with nested data
|
||||
target_data = targets.data.data
|
||||
else: # Tensor with data
|
||||
target_data = targets.data
|
||||
else:
|
||||
target_data = targets
|
||||
|
||||
@@ -2311,27 +2381,31 @@ class CrossEntropyLoss:
|
||||
# One-hot labels
|
||||
loss = -np.mean(np.sum(target_data * np.log(softmax_pred), axis=-1))
|
||||
|
||||
# Return as Variable with gradient function
|
||||
result = Variable(loss, requires_grad=True)
|
||||
if _autograd_available:
|
||||
# Return as Variable with gradient function
|
||||
result = Variable(loss, requires_grad=True)
|
||||
|
||||
# Define backward function for proper gradient flow
|
||||
def grad_fn(gradient):
|
||||
if isinstance(predictions, Variable) and predictions.requires_grad:
|
||||
batch_size = pred_data.shape[0]
|
||||
# Define backward function for proper gradient flow
|
||||
def grad_fn(gradient):
|
||||
if isinstance(predictions, Variable) and predictions.requires_grad:
|
||||
batch_size = pred_data.shape[0]
|
||||
|
||||
# Gradient of cross-entropy with softmax
|
||||
if len(target_data.shape) == 1 or target_data.shape[-1] == 1:
|
||||
# Integer labels - gradient is (softmax - one_hot_targets)
|
||||
grad = softmax_pred.copy()
|
||||
for i in range(batch_size):
|
||||
label = int(target_data[i])
|
||||
grad[i, label] -= 1
|
||||
grad = grad / batch_size * gradient # Scale by incoming gradient
|
||||
else:
|
||||
# One-hot labels
|
||||
grad = (softmax_pred - target_data) / batch_size * gradient
|
||||
# Gradient of cross-entropy with softmax
|
||||
if len(target_data.shape) == 1 or target_data.shape[-1] == 1:
|
||||
# Integer labels - gradient is (softmax - one_hot_targets)
|
||||
grad = softmax_pred.copy()
|
||||
for i in range(batch_size):
|
||||
label = int(target_data[i])
|
||||
grad[i, label] -= 1
|
||||
grad = grad / batch_size * gradient # Scale by incoming gradient
|
||||
else:
|
||||
# One-hot labels
|
||||
grad = (softmax_pred - target_data) / batch_size * gradient
|
||||
|
||||
predictions.backward(grad)
|
||||
predictions.backward(grad)
|
||||
|
||||
result.grad_fn = grad_fn
|
||||
return result
|
||||
result.grad_fn = grad_fn
|
||||
return result
|
||||
else:
|
||||
# Basic tensor operation - no gradient tracking yet
|
||||
return Tensor(loss)
|
||||
Reference in New Issue
Block a user