mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-05-30 13:05:52 -05:00
This commit includes: - Exported tinytorch package files from nbdev (autograd, losses, optimizers, training, etc.) - Updated activations.py and layers.py with __call__ methods - New module exports: attention, spatial, tokenization, transformer, etc. - Removed old _modidx.py file - Cleanup of duplicate milestone directories These are the generated package files that correspond to the source modules we've been developing. Students will import from these when using TinyTorch.
466 lines
17 KiB
Python
Generated
466 lines
17 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/01_tensor/tensor_dev.ipynb.
|
||
|
||
# %% auto 0
|
||
__all__ = ['Tensor']
|
||
|
||
# %% ../../modules/source/01_tensor/tensor_dev.ipynb 1
|
||
import numpy as np
|
||
|
||
# %% ../../modules/source/01_tensor/tensor_dev.ipynb 6
|
||
class Tensor:
    """Educational tensor that grows with student knowledge.

    This class starts simple but includes dormant features for future modules:
    - requires_grad: Will be used for automatic differentiation (Module 05)
    - grad: Will store computed gradients (Module 05)
    - backward(): Will compute gradients (Module 05)

    For now, focus on: data, shape, and basic operations.
    """

    def __init__(self, data, requires_grad=False):
        """
        Create a new tensor from data.

        TODO: Initialize tensor attributes

        APPROACH:
        1. Convert data to NumPy array - handles lists, scalars, etc.
        2. Store shape and size for quick access
        3. Set up gradient tracking (dormant until Module 05)

        EXAMPLE:
        >>> tensor = Tensor([1, 2, 3])
        >>> print(tensor.data)
        [1. 2. 3.]
        >>> print(tensor.shape)
        (3,)

        HINT: np.array() handles type conversion automatically

        Args:
            data: Anything np.array() can convert to float32 (scalar, nested
                list, NumPy array, ...).
            requires_grad: Flag stored on the tensor; unused by this module.
        """
        ### BEGIN SOLUTION
        # Core tensor data - always present.
        # np.array copies its input and the dtype is forced to float32, so
        # integer inputs are stored (and printed) as floats.
        self.data = np.array(data, dtype=np.float32)  # Consistent float32 for ML
        self.shape = self.data.shape
        self.size = self.data.size
        self.dtype = self.data.dtype

        # Gradient features (dormant until Module 05)
        self.requires_grad = requires_grad
        self.grad = None
        ### END SOLUTION

    def __repr__(self):
        """String representation of tensor for debugging."""
        # requires_grad is only shown when it is truthy, to keep output short.
        grad_info = f", requires_grad={self.requires_grad}" if self.requires_grad else ""
        return f"Tensor(data={self.data}, shape={self.shape}{grad_info})"

    def __str__(self):
        """Human-readable string representation."""
        return f"Tensor({self.data})"

    def numpy(self):
        """Return the underlying NumPy array (the same object, not a copy)."""
        return self.data

    # nbgrader={"grade": false, "grade_id": "addition-impl", "solution": true}
    def __add__(self, other):
        """
        Add two tensors element-wise with broadcasting support.

        TODO: Implement tensor addition with automatic broadcasting

        APPROACH:
        1. Handle both Tensor and scalar inputs
        2. Use NumPy's broadcasting for automatic shape alignment
        3. Return new Tensor with result (don't modify self)

        EXAMPLE:
        >>> a = Tensor([1, 2, 3])
        >>> b = Tensor([4, 5, 6])
        >>> result = a + b
        >>> print(result.data)
        [5. 7. 9.]

        BROADCASTING EXAMPLE:
        >>> matrix = Tensor([[1, 2], [3, 4]])  # Shape: (2, 2)
        >>> vector = Tensor([10, 20])          # Shape: (2,)
        >>> result = matrix + vector           # Broadcasting: (2,2) + (2,) → (2,2)
        >>> print(result.data)
        [[11. 22.]
         [13. 24.]]

        HINTS:
        - Use isinstance() to check if other is a Tensor
        - NumPy handles broadcasting automatically with +
        - Always return a new Tensor, don't modify self
        - Preserve gradient tracking for future modules

        Args:
            other: A Tensor, or any value NumPy can add to an array.

        Returns:
            A new Tensor; its requires_grad is set if either Tensor operand
            has requires_grad set.
        """
        ### BEGIN SOLUTION
        other_is_tensor = isinstance(other, Tensor)

        # Unwrap Tensor operands; NumPy broadcasts arrays and scalars alike.
        result = Tensor(self.data + (other.data if other_is_tensor else other))

        # Preserve gradient tracking if either operand requires gradients.
        # Only Tensor operands can contribute a requires_grad flag.
        result.requires_grad = self.requires_grad or (
            other_is_tensor and other.requires_grad
        )
        return result
        ### END SOLUTION

    # nbgrader={"grade": false, "grade_id": "more-arithmetic", "solution": true}
    def __sub__(self, other):
        """
        Subtract two tensors element-wise.

        Common use: Centering data (x - mean), computing differences for loss functions.

        Returns a new Tensor created with the default requires_grad=False.
        """
        if isinstance(other, Tensor):
            return Tensor(self.data - other.data)
        else:
            return Tensor(self.data - other)

    def __mul__(self, other):
        """
        Multiply two tensors element-wise (NOT matrix multiplication).

        Common use: Scaling features, applying masks, gating mechanisms in neural networks.
        Note: This is * operator, not @ (which will be matrix multiplication).

        Returns a new Tensor created with the default requires_grad=False.
        """
        if isinstance(other, Tensor):
            return Tensor(self.data * other.data)
        else:
            return Tensor(self.data * other)

    def __truediv__(self, other):
        """
        Divide two tensors element-wise.

        Common use: Normalization (x / std), converting counts to probabilities.

        Returns a new Tensor created with the default requires_grad=False.
        """
        if isinstance(other, Tensor):
            return Tensor(self.data / other.data)
        else:
            return Tensor(self.data / other)

    # nbgrader={"grade": false, "grade_id": "matmul-impl", "solution": true}
    def matmul(self, other):
        """
        Matrix multiplication of two tensors.

        TODO: Implement matrix multiplication using np.matmul with proper validation

        APPROACH:
        1. Validate inputs are Tensors
        2. Check dimension compatibility (inner dimensions must match)
        3. Use np.matmul for optimized (and batch-aware) computation
        4. Return new Tensor with result

        EXAMPLE:
        >>> a = Tensor([[1, 2], [3, 4]])  # 2×2
        >>> b = Tensor([[5, 6], [7, 8]])  # 2×2
        >>> result = a.matmul(b)          # 2×2 result
        >>> # Result: [[1×5+2×7, 1×6+2×8], [3×5+4×7, 3×6+4×8]] = [[19, 22], [43, 50]]

        SHAPE RULES:
        - (M, K) @ (K, N) → (M, N)        ✓ Valid
        - (B, M, K) @ (B, K, N) → (B, M, N) ✓ Valid (leading dims are batch dims)
        - (M, K) @ (J, N) → Error         ✗ K ≠ J

        COMPLEXITY: O(M×N×K) for (M×K) @ (K×N) matrices

        HINTS:
        - np.matmul handles the optimization (and batching) for us
        - Check self.shape[-1] == other.shape[-2] for compatibility
        - Provide clear error messages for debugging

        Args:
            other: The right-hand Tensor.

        Returns:
            A new Tensor holding the product. If either operand is 0-D the
            result is the element-wise (scalar) product instead.

        Raises:
            TypeError: If other is not a Tensor.
            ValueError: If the contracted dimensions do not match (or the
                batch dimensions cannot be broadcast by NumPy).
        """
        ### BEGIN SOLUTION
        if not isinstance(other, Tensor):
            raise TypeError(f"Expected Tensor for matrix multiplication, got {type(other)}")

        # Edge case: np.matmul rejects 0-D operands, so treat them as
        # scalar multiplication.
        if self.shape == () or other.shape == ():
            return Tensor(self.data * other.data)

        lhs_ndim = len(self.shape)
        rhs_ndim = len(other.shape)

        # Check dimension compatibility up front so the error explains the
        # problem in terms of the tensors' shapes.
        if lhs_ndim >= 2 and rhs_ndim >= 2:
            if self.shape[-1] != other.shape[-2]:
                raise ValueError(
                    f"Cannot perform matrix multiplication: {self.shape} @ {other.shape}. "
                    f"Inner dimensions must match: {self.shape[-1]} ≠ {other.shape[-2]}. "
                    f"💡 HINT: For (M,K) @ (K,N) → (M,N), the K dimensions must be equal."
                )
        elif lhs_ndim == 1 and rhs_ndim >= 2:
            # Vector @ Matrix: the vector contracts with the matrix rows
            # (second-to-last axis).
            if self.shape[0] != other.shape[-2]:
                raise ValueError(
                    f"Cannot multiply vector {self.shape} with matrix {other.shape}. "
                    f"Vector length {self.shape[0]} must match matrix rows {other.shape[-2]}."
                )
        elif lhs_ndim >= 2 and rhs_ndim == 1:
            # Matrix @ Vector: the matrix columns (last axis) contract with
            # the vector.
            if self.shape[-1] != other.shape[0]:
                raise ValueError(
                    f"Cannot multiply matrix {self.shape} with vector {other.shape}. "
                    f"Matrix columns {self.shape[-1]} must match vector length {other.shape[0]}."
                )
        elif self.shape[0] != other.shape[0]:
            # Vector @ Vector (inner product)
            raise ValueError(
                f"Cannot multiply vector {self.shape} with vector {other.shape}. "
                f"Vector lengths must match: {self.shape[0]} ≠ {other.shape[0]}."
            )

        # np.matmul equals np.dot for 1-D/2-D operands, but for higher ranks
        # it broadcasts the leading (batch) dimensions instead of forming
        # np.dot's outer product over them.
        result_data = np.matmul(self.data, other.data)
        return Tensor(result_data)
        ### END SOLUTION

    # nbgrader={"grade": false, "grade_id": "shape-ops", "solution": true}
    def reshape(self, *shape):
        """
        Reshape tensor to new dimensions.

        TODO: Implement tensor reshaping with validation

        APPROACH:
        1. Handle different calling conventions: reshape(2, 3) vs reshape((2, 3))
        2. Validate total elements remain the same
        3. Use NumPy's reshape for the actual operation
        4. Return new Tensor (keep immutability)

        EXAMPLE:
        >>> tensor = Tensor([1, 2, 3, 4, 5, 6])  # Shape: (6,)
        >>> reshaped = tensor.reshape(2, 3)      # Shape: (2, 3)
        >>> print(reshaped.data)
        [[1. 2. 3.]
         [4. 5. 6.]]

        COMMON USAGE:
        >>> # Flatten for MLP input
        >>> image = Tensor(np.random.rand(3, 32, 32))  # (channels, height, width)
        >>> flattened = image.reshape(-1)              # (3072,) - all pixels in vector
        >>>
        >>> # Prepare batch for convolution
        >>> batch = Tensor(np.random.rand(32, 784))    # (batch, features)
        >>> images = batch.reshape(32, 1, 28, 28)      # (batch, channels, height, width)

        HINTS:
        - Handle both reshape(2, 3) and reshape((2, 3)) calling styles
        - Check np.prod(new_shape) == self.size for validation
        - Use descriptive error messages for debugging

        Args:
            *shape: The target dimensions, either as separate integers or as
                a single tuple/list. At most one dimension may be -1, in
                which case it is inferred from the tensor's size.

        Returns:
            A new Tensor with the requested shape.

        Raises:
            ValueError: If more than one -1 is given, if -1 cannot be
                inferred, or if the element count would change.
        """
        ### BEGIN SOLUTION
        # Handle both reshape(2, 3) and reshape((2, 3)) calling conventions
        if len(shape) == 1 and isinstance(shape[0], (tuple, list)):
            new_shape = tuple(shape[0])
        else:
            new_shape = tuple(shape)

        # Handle -1 for automatic dimension inference (like NumPy)
        if -1 in new_shape:
            if new_shape.count(-1) > 1:
                raise ValueError("Can only specify one unknown dimension with -1")

            # Multiply together every explicitly given dimension.
            unknown_idx = new_shape.index(-1)
            known_size = 1
            for i, dim in enumerate(new_shape):
                if i != unknown_idx:
                    known_size *= dim

            # A zero-sized known dimension makes -1 ambiguous; report it as
            # a ValueError (as NumPy does) instead of dividing by zero.
            if known_size == 0:
                raise ValueError(
                    f"Cannot infer the -1 dimension for shape {new_shape}: "
                    f"the other dimensions multiply to 0."
                )

            # If size is not divisible by known_size, the floor division
            # yields a shape that fails the element-count check below.
            unknown_dim = self.size // known_size
            new_shape = (
                new_shape[:unknown_idx] + (unknown_dim,) + new_shape[unknown_idx + 1:]
            )

        # Validate total elements remain the same
        if np.prod(new_shape) != self.size:
            raise ValueError(
                f"Cannot reshape tensor of size {self.size} to shape {new_shape}. "
                f"Total elements must match: {self.size} ≠ {np.prod(new_shape)}. "
                f"💡 HINT: Make sure new_shape dimensions multiply to {self.size}"
            )

        # Reshape the data (NumPy handles the memory layout efficiently)
        reshaped_data = np.reshape(self.data, new_shape)
        return Tensor(reshaped_data)
        ### END SOLUTION

    def transpose(self, dim0=None, dim1=None):
        """
        Transpose tensor dimensions.

        TODO: Implement tensor transposition

        APPROACH:
        1. Handle default case (transpose last two dimensions)
        2. Handle specific dimension swapping
        3. Use NumPy's transpose with proper axis specification
        4. Return new Tensor

        EXAMPLE:
        >>> matrix = Tensor([[1, 2, 3], [4, 5, 6]])  # (2, 3)
        >>> transposed = matrix.transpose()          # (3, 2)
        >>> print(transposed.data)
        [[1. 4.]
         [2. 5.]
         [3. 6.]]

        NEURAL NETWORK USAGE:
        >>> # Weight matrix transpose for backward pass
        >>> W = Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])  # (3, 2)
        >>> W_T = W.transpose()  # (2, 3) - for gradient computation
        >>>
        >>> # Attention mechanism
        >>> Q = Tensor([[1, 2], [3, 4]])  # queries (2, 2)
        >>> K = Tensor([[5, 6], [7, 8]])  # keys (2, 2)
        >>> attention_scores = Q.matmul(K.transpose())  # Q @ K^T

        HINTS:
        - Default: transpose last two dimensions (most common case)
        - Use np.transpose() with axes parameter
        - Handle 1D tensors gracefully (transpose is identity)

        Args:
            dim0: First dimension to swap, or None for the default behavior.
            dim1: Second dimension to swap, or None for the default behavior.
                Negative values count from the last dimension, so
                transpose(-2, -1) swaps the last two dimensions.

        Returns:
            A new Tensor with the two dimensions swapped.

        Raises:
            ValueError: If only one of dim0/dim1 is given, or if either is
                out of range for this tensor.
        """
        ### BEGIN SOLUTION
        ndim = len(self.shape)

        if dim0 is None and dim1 is None:
            # Default: transpose last two dimensions
            if ndim < 2:
                # For 0-D/1-D tensors, transpose is the identity operation
                return Tensor(self.data.copy())
            dim0, dim1 = -2, -1
        else:
            # Specific dimensions to transpose
            if dim0 is None or dim1 is None:
                raise ValueError("Both dim0 and dim1 must be specified for specific dimension transpose")

            # Validate dimensions exist; negative indices are accepted in
            # the usual Python range [-ndim, ndim).
            if not (-ndim <= dim0 < ndim and -ndim <= dim1 < ndim):
                raise ValueError(
                    f"Dimension out of range for tensor with shape {self.shape}. "
                    f"Got dim0={dim0}, dim1={dim1}, but tensor has {len(self.shape)} dimensions."
                )

        # Build the identity permutation and swap the two requested axes.
        # List indexing resolves negative dims for us.
        axes = list(range(ndim))
        axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
        transposed_data = np.transpose(self.data, axes)
        return Tensor(transposed_data)
        ### END SOLUTION

    # nbgrader={"grade": false, "grade_id": "reduction-ops", "solution": true}
    def sum(self, axis=None, keepdims=False):
        """
        Sum tensor along specified axis.

        TODO: Implement tensor sum with axis control

        APPROACH:
        1. Use NumPy's sum with axis parameter
        2. Handle axis=None (sum all elements) vs specific axis
        3. Support keepdims to maintain shape for broadcasting
        4. Return new Tensor with result

        EXAMPLE:
        >>> tensor = Tensor([[1, 2], [3, 4]])
        >>> total = tensor.sum()          # Sum all elements: 10
        >>> col_sum = tensor.sum(axis=0)  # Sum columns: [4, 6]
        >>> row_sum = tensor.sum(axis=1)  # Sum rows: [3, 7]

        NEURAL NETWORK USAGE:
        >>> # Batch loss computation
        >>> batch_losses = Tensor([0.1, 0.3, 0.2, 0.4])  # Individual losses
        >>> total_loss = batch_losses.sum()              # Total: 1.0
        >>> avg_loss = batch_losses.mean()               # Average: 0.25
        >>>
        >>> # Global sum pooling (use .mean(axis=(2, 3)) for average pooling)
        >>> feature_maps = Tensor(np.random.rand(32, 256, 7, 7))  # (batch, channels, h, w)
        >>> global_features = feature_maps.sum(axis=(2, 3))       # (batch, channels)

        HINTS:
        - np.sum handles all the complexity for us
        - axis=None sums all elements (returns scalar)
        - axis=0 sums along first dimension, axis=1 along second, etc.
        - keepdims=True preserves dimensions for broadcasting

        Args:
            axis: Passed through to np.sum (None, an int, or a tuple of ints).
            keepdims: Passed through to np.sum.

        Returns:
            A new Tensor wrapping the reduced values.
        """
        ### BEGIN SOLUTION
        result = np.sum(self.data, axis=axis, keepdims=keepdims)
        return Tensor(result)
        ### END SOLUTION

    def mean(self, axis=None, keepdims=False):
        """
        Compute mean of tensor along specified axis.

        Common usage: Batch normalization, loss averaging, global pooling.

        Args:
            axis: Passed through to np.mean (None, an int, or a tuple of ints).
            keepdims: Passed through to np.mean.

        Returns:
            A new Tensor wrapping the reduced values.
        """
        ### BEGIN SOLUTION
        result = np.mean(self.data, axis=axis, keepdims=keepdims)
        return Tensor(result)
        ### END SOLUTION

    def max(self, axis=None, keepdims=False):
        """
        Find maximum values along specified axis.

        Common usage: Max pooling, finding best predictions, activation clipping.

        Args:
            axis: Passed through to np.max (None, an int, or a tuple of ints).
            keepdims: Passed through to np.max.

        Returns:
            A new Tensor wrapping the reduced values.
        """
        ### BEGIN SOLUTION
        result = np.max(self.data, axis=axis, keepdims=keepdims)
        return Tensor(result)
        ### END SOLUTION

    # nbgrader={"grade": false, "grade_id": "gradient-placeholder", "solution": true}
    def backward(self):
        """
        Compute gradients (implemented in Module 05: Autograd).

        TODO: Placeholder implementation for gradient computation

        STUDENT NOTE:
        This method exists but does nothing until Module 05: Autograd.
        Don't worry about it for now - focus on the basic tensor operations.

        In Module 05, we'll implement:
        - Gradient computation via chain rule
        - Automatic differentiation
        - Backpropagation through operations
        - Computation graph construction

        FUTURE IMPLEMENTATION PREVIEW:
        ```python
        def backward(self, gradient=None):
            # Module 05 will implement:
            # 1. Set gradient for this tensor
            # 2. Propagate to parent operations
            # 3. Apply chain rule recursively
            # 4. Accumulate gradients properly
            pass
        ```

        CURRENT BEHAVIOR:
        >>> x = Tensor([1, 2, 3], requires_grad=True)
        >>> y = x * 2
        >>> y.sum().backward()  # Calls this method - does nothing
        >>> print(x.grad)       # Still None
        None
        """
        ### BEGIN SOLUTION
        # Placeholder - will be implemented in Module 05
        # For now, just ensure it doesn't crash when called
        # This allows students to experiment with gradient syntax
        # without getting confusing errors about missing methods
        pass
        ### END SOLUTION
|