🧠 Core ML: Standardize test naming in neural network building blocks

- Activations: test_integration_* → test_module_* (module dependency tests)
- Layers: test_matrix_multiplication → test_unit_matrix_multiplication
- Layers: test_dense_layer → test_unit_dense_layer
- Layers: test_layer_activation → test_unit_layer_activation
- Dense: test_integration_* → test_module_* (module dependency tests)
- Spatial: test_integration_* → test_module_* (module dependency tests)
- Attention: test_integration_* → test_module_* (module dependency tests)
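- Establishes unit vs module test distinction for neural network components
- Also renames the remaining test_* functions to test_unit_*, adds batched (batch_size, H, W) input support to Conv2D.forward, and switches the attention API from np.ndarray to Tensor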
This commit is contained in:
Vijay Janapa Reddi
2025-07-20 08:39:00 -04:00
parent bcb6f02abd
commit fd6c15da48
5 changed files with 388 additions and 157 deletions

View File

@@ -227,8 +227,8 @@ Once you implement the ReLU forward method above, run this cell to test it:
"""
# %% nbgrader={"grade": true, "grade_id": "test-relu-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def test_relu_activation():
"""Test ReLU activation function"""
def test_unit_relu_activation():
"""Unit test for the ReLU activation function."""
print("🔬 Unit Test: ReLU Activation...")
# Create ReLU instance
@@ -265,7 +265,7 @@ def test_relu_activation():
print(f"✅ Works with multi-dimensional tensors")
# Run the test
test_relu_activation()
test_unit_relu_activation()
# %% [markdown]
"""
@@ -365,8 +365,8 @@ Once you implement the Sigmoid forward method above, run this cell to test it:
"""
# %% nbgrader={"grade": true, "grade_id": "test-sigmoid-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def test_sigmoid_activation():
"""Test Sigmoid activation function"""
def test_unit_sigmoid_activation():
"""Unit test for the Sigmoid activation function."""
print("🔬 Unit Test: Sigmoid Activation...")
# Create Sigmoid instance
@@ -412,7 +412,7 @@ def test_sigmoid_activation():
print(f"✅ Shape preservation working")
# Run the test
test_sigmoid_activation()
test_unit_sigmoid_activation()
# %% [markdown]
"""
@@ -511,8 +511,8 @@ Once you implement the Tanh forward method above, run this cell to test it:
"""
# %% nbgrader={"grade": true, "grade_id": "test-tanh-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def test_tanh_activation():
"""Test Tanh activation function"""
def test_unit_tanh_activation():
"""Unit test for the Tanh activation function."""
print("🔬 Unit Test: Tanh Activation...")
# Create Tanh instance
@@ -562,7 +562,7 @@ def test_tanh_activation():
print(f"✅ Handles extreme values correctly")
# Run the test
test_tanh_activation()
test_unit_tanh_activation()
# %% [markdown]
"""
@@ -679,8 +679,8 @@ Once you implement the Softmax forward method above, run this cell to test it:
"""
# %% nbgrader={"grade": true, "grade_id": "test-softmax-immediate", "locked": true, "points": 15, "schema_version": 3, "solution": false, "task": false}
def test_softmax_activation():
"""Test Softmax activation function"""
def test_unit_softmax_activation():
"""Unit test for the Softmax activation function."""
print("🔬 Unit Test: Softmax Activation...")
# Create Softmax instance
@@ -736,7 +736,7 @@ def test_softmax_activation():
print(f"✅ Numerically stable with large values")
# Run the test
test_softmax_activation()
test_unit_softmax_activation()
# %% [markdown]
"""
@@ -752,8 +752,8 @@ Let's test how all activation functions work together in a realistic neural netw
"""
# %% nbgrader={"grade": true, "grade_id": "test-activations-comprehensive", "locked": true, "points": 15, "schema_version": 3, "solution": false, "task": false}
def test_activations():
"""Test all activation functions working together"""
def test_unit_activations_comprehensive():
"""Comprehensive unit test for all activation functions working together."""
print("🔬 Unit Test: Activation Functions Comprehensive Test...")
# Create instances of all activation functions
@@ -835,7 +835,7 @@ def test_activations():
print(f"✅ Ready for neural network integration!")
# Run the comprehensive test
test_activations()
test_unit_activations_comprehensive()
# %% [markdown]
"""
@@ -852,7 +852,60 @@ Time to test your implementation! This section uses TinyTorch's standardized tes
# This cell is locked to ensure consistent testing across all TinyTorch modules
# =============================================================================
# %% [markdown]
"""
## 🔬 Integration Test: Activations with Tensors
"""
# %%
def test_module_activations_tensor_compatibility():
    """
    Module test: every activation must accept a Tensor and return a Tensor.

    One small 1-D Tensor is passed through ReLU, Sigmoid, Tanh and Softmax.
    Each result is compared with the equivalent NumPy expression, and the
    Softmax output is additionally required to sum to 1.
    """
    print("🔬 Running Integration Test: Activations with Tensors...")

    def _check(result, reference, type_message, value_message):
        # Type first, then values: mirrors the order of the two assertions
        # made for every activation.
        assert isinstance(result, Tensor), type_message
        assert np.allclose(result.data, reference), value_message

    # Shared input covering negative, zero and positive values.
    raw = np.array([-2., -1., 0., 1., 2.])
    x = Tensor(raw)

    # ReLU
    relu_out = ReLU()(x)
    _check(relu_out, np.maximum(0, raw),
           "ReLU output should be a Tensor", "ReLU calculation is incorrect")
    print("✅ ReLU integrates correctly with Tensor.")

    # Sigmoid
    sigmoid_out = Sigmoid()(x)
    sigmoid_reference = 1 / (1 + np.exp(-raw))
    _check(sigmoid_out, sigmoid_reference,
           "Sigmoid output should be a Tensor", "Sigmoid calculation is incorrect")
    print("✅ Sigmoid integrates correctly with Tensor.")

    # Tanh
    tanh_out = Tanh()(x)
    _check(tanh_out, np.tanh(raw),
           "Tanh output should be a Tensor", "Tanh calculation is incorrect")
    print("✅ Tanh integrates correctly with Tensor.")

    # Softmax (reference uses the max-shifted form)
    softmax_out = Softmax()(x)
    shifted = np.exp(raw - raw.max())
    softmax_reference = shifted / shifted.sum(axis=0)
    _check(softmax_out, softmax_reference,
           "Softmax output should be a Tensor", "Softmax calculation is incorrect")
    assert abs(softmax_out.data.sum() - 1.0) < 1e-6, "Softmax output should sum to 1"
    print("✅ Softmax integrates correctly with Tensor.")

    print("✅ Integration Test Passed: All activation functions are compatible with Tensors.")
if __name__ == "__main__":
test_module_activations_tensor_compatibility()
from tito.tools.testing import run_module_tests_auto
# Automatically discover and run all tests in this module

View File

@@ -255,7 +255,7 @@ Once you implement the `matmul` function above, run this cell to test it:
"""
# %% nbgrader={"grade": true, "grade_id": "test-matmul-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def test_matrix_multiplication():
def test_unit_matrix_multiplication():
"""Test matrix multiplication implementation"""
print("🔬 Unit Test: Matrix Multiplication...")
@@ -469,7 +469,7 @@ Once you implement the Dense layer above, run this cell to test it:
"""
# %% nbgrader={"grade": true, "grade_id": "test-dense-layer", "locked": true, "points": 15, "schema_version": 3, "solution": false, "task": false}
def test_dense_layer():
def test_unit_dense_layer():
"""Test Dense layer implementation"""
print("🔬 Unit Test: Dense Layer...")
@@ -555,7 +555,7 @@ final_output = activation_function(linear_output)
"""
# %% nbgrader={"grade": true, "grade_id": "test-layer-activation-comprehensive", "locked": true, "points": 15, "schema_version": 3, "solution": false, "task": false}
def test_layer_activation():
def test_unit_layer_activation():
"""Test Dense layer comprehensive testing with activation functions"""
print("🔬 Unit Test: Layer-Activation Comprehensive Test...")
@@ -632,6 +632,45 @@ def test_layer_activation():
# Run the test. The call must use the test_unit_* name given to the function
# defined in this cell; the old name no longer exists and raises NameError.
test_unit_layer_activation()
# %% [markdown]
"""
## 🔬 Integration Test: Layers with Tensors
This is our first cumulative integration test.
It ensures that the 'Layer' abstraction works correctly with the 'Tensor' class from the previous module.
"""
# %%
def test_layer_tensor_integration():
    """
    Check that a Tensor can be pushed through a Layer subclass.

    A throwaway layer whose forward pass doubles its input is called on a
    small Tensor; the result must still be a Tensor and must hold the
    doubled values.
    """
    print("🔬 Running Integration Test: Layer with Tensor...")

    class DoubleLayer(Dense):
        """Dense subclass (reused for its __call__) whose forward doubles x."""

        def forward(self, x: Tensor) -> Tensor:
            return x * 2

    # The sizes are placeholders: the overridden forward never uses them.
    layer = DoubleLayer(input_size=1, output_size=1)

    source = Tensor([1, 2, 3])
    result = layer(source)

    assert isinstance(result, Tensor), "Output should be a Tensor"
    doubled = np.array([2, 4, 6])
    assert np.array_equal(result.data, doubled), "Output data is incorrect"
    print("✅ Integration Test Passed: Layer correctly processed Tensor.")


if __name__ == "__main__":
    test_layer_tensor_integration()
# %% [markdown]
"""
## 🧪 Module Testing

View File

@@ -555,6 +555,41 @@ Let's test different network architectures to understand their behavior.
"""
# %% nbgrader={"grade": true, "grade_id": "test-architectures", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def plot_network_architectures():
    """Draw bar charts of the outputs of three small MLP variants.

    Returns immediately when `_should_show_plots()` is false. Otherwise it
    builds a ReLU network, a Tanh network and a Softmax classifier with
    `create_mlp`, runs the same single-sample input through each, and shows
    the three outputs side by side.
    """
    if not _should_show_plots():
        return

    # Build the three variants (construction order kept: ReLU, Tanh, Softmax).
    relu_net = create_mlp(input_size=3, hidden_sizes=[4], output_size=1, activation=ReLU)
    tanh_net = create_mlp(input_size=3, hidden_sizes=[4], output_size=1, activation=Tanh)
    classifier = create_mlp(input_size=3, hidden_sizes=[4], output_size=3, output_activation=Softmax)

    # One sample with three features, fed to every network.
    sample = Tensor([[1.0, 2.0, 3.0]])
    y_relu = relu_net(sample)
    y_tanh = tanh_net(sample)
    y_multi = classifier(sample)

    _, axs = plt.subplots(1, 3, figsize=(15, 4))

    # The two single-output networks share the same one-bar layout.
    scalar_panels = (
        (axs[0], "ReLU Network Output", y_relu, 'skyblue'),
        (axs[1], "Tanh Network Output", y_tanh, 'salmon'),
    )
    for axis, title, result, bar_color in scalar_panels:
        axis.set_title(title)
        axis.bar(['Output'], [result.data[0][0]], color=bar_color)

    # The classifier gets one bar per class.
    class_axis = axs[2]
    class_axis.set_title("Softmax Classifier Output")
    class_axis.bar([f"Class {i}" for i in range(3)], y_multi.data[0], color='lightgreen')

    plt.tight_layout()
    plt.show()
def test_unit_network_architectures():
"""Unit test for different network architectures."""
# Test different architectures
print("🔬 Unit Test: Network Architecture Variations...")
@@ -602,6 +637,9 @@ try:
print("✅ All network architectures work correctly")
# Plot the architectures if not in test mode
plot_network_architectures()
except Exception as e:
print(f"❌ Architecture test failed: {e}")
raise
@@ -780,8 +818,8 @@ class MLP:
# %% [markdown]
def test_sequential_networks():
"""Test Sequential network implementation comprehensively."""
def test_unit_sequential_networks():
"""Unit test for the Sequential network implementation."""
print("🔬 Unit Test: Sequential Networks...")
# Test basic Sequential network
@@ -801,8 +839,8 @@ def test_sequential_networks():
print("✅ Sequential networks work correctly")
def test_mlp_creation():
"""Test MLP creation function comprehensively."""
def test_unit_mlp_creation():
"""Unit test for the MLP creation function."""
print("🔬 Unit Test: MLP Creation...")
# Test different MLP architectures
@@ -821,8 +859,8 @@ def test_mlp_creation():
print("✅ MLP creation works correctly")
def test_network_architectures():
"""Test different network architectures comprehensively."""
def test_unit_network_architectures():
"""Unit test for different network architectures."""
print("🔬 Unit Test: Network Architectures...")
# Test different activation functions
@@ -846,8 +884,8 @@ def test_network_architectures():
print("✅ Network architectures work correctly")
def test_networks():
"""Test network comprehensive testing with real ML scenarios."""
def test_unit_network_applications():
"""Comprehensive unit test for network applications in real ML scenarios."""
print("🔬 Comprehensive Test: Network Applications...")
# Test multi-class classification
@@ -874,7 +912,43 @@ Time to test your implementation! This section uses TinyTorch's standardized tes
# This cell is locked to ensure consistent testing across all TinyTorch modules
# =============================================================================
# %% [markdown]
"""
## 🔬 Integration Test: End-to-End Network Forward Pass
"""
# %%
def test_module_full_network_forward_pass():
    """
    Module test: run one batch through a Sequential stack end to end.

    Builds Dense(3, 4) -> ReLU -> Dense(4, 2) inside a Sequential container,
    feeds it a random (5, 3) Tensor, and checks that the result is a Tensor
    of shape (5, 2).
    """
    print("🔬 Running Integration Test: Full Network Forward Pass...")

    # Input (3) -> Dense(4) -> ReLU -> Dense(2) -> Output
    stack = [Dense(3, 4), ReLU(), Dense(4, 2)]
    network = Sequential(stack)

    # Five samples with three features each.
    batch = Tensor(np.random.randn(5, 3))

    output_tensor = network(batch)

    assert isinstance(output_tensor, Tensor), "Network output must be a Tensor"
    assert output_tensor.shape == (5, 2), f"Expected output shape (5, 2), but got {output_tensor.shape}"
    print("✅ Integration Test Passed: Full network forward pass is successful.")
if __name__ == "__main__":
test_module_full_network_forward_pass()
from tito.tools.testing import run_module_tests_auto
# Automatically discover and run all tests in this module

View File

@@ -347,36 +347,34 @@ class Conv2D:
def forward(self, x):
    """
    Forward pass through the Conv2D layer.

    Applies `conv2d_naive` with the layer's stored kernel, either to a
    single 2D input or independently to every item of a 3D batch.

    Args:
        x: Input tensor, either (H, W) or (batch_size, H, W).

    Returns:
        Tensor holding the convolution result. Batched inputs keep their
        leading batch dimension.

    EXAMPLE:
        x = Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])  # shape (3, 3)
        layer = Conv2D((2, 2))
        y = layer(x)  # shape (2, 2)
    """
    ### BEGIN SOLUTION
    if len(x.shape) == 3:
        # Batched input: convolve each item on its own, then restack the
        # per-item results along a new leading axis.
        output_data = np.stack(
            [conv2d_naive(image, self.kernel) for image in x.data]
        )
    else:
        # Anything that is not 3D is handed to conv2d_naive in one call.
        output_data = conv2d_naive(x.data, self.kernel)

    return Tensor(output_data)
    ### END SOLUTION
def __call__(self, x):
"""Make layer callable: layer(x) same as layer.forward(x)"""
@@ -725,8 +723,8 @@ except Exception as e:
print("📈 Final Progress: Complete CNN system ready for computer vision!")
def test_convolution_operation():
"""Test convolution operation implementation comprehensively."""
def test_unit_convolution_operation():
"""Unit test for the convolution operation implementation."""
print("🔬 Unit Test: Convolution Operation...")
# Test basic convolution
@@ -740,8 +738,8 @@ def test_convolution_operation():
print("✅ Convolution operation works correctly")
def test_conv2d_layer():
"""Test Conv2D layer implementation comprehensively."""
def test_unit_conv2d_layer():
"""Unit test for the Conv2D layer implementation."""
print("🔬 Unit Test: Conv2D Layer...")
# Test Conv2D layer
@@ -755,8 +753,8 @@ def test_conv2d_layer():
print("✅ Conv2D layer works correctly")
def test_flatten_function():
"""Test flatten function implementation comprehensively."""
def test_unit_flatten_function():
"""Unit test for the flatten function implementation."""
print("🔬 Unit Test: Flatten Function...")
# Test flatten function
@@ -786,7 +784,42 @@ Time to test your implementation! This section uses TinyTorch's standardized tes
# This cell is locked to ensure consistent testing across all TinyTorch modules
# =============================================================================
# %% [markdown]
"""
## 🔬 Integration Test: Conv2D Layer with Tensors
"""
# %%
def test_module_conv2d_tensor_compatibility():
    """
    Module test: Conv2D must accept a batched Tensor and return a Tensor.

    A Conv2D layer with a 3x3 kernel is applied to a random batch of five
    10x10 inputs. The result has to be a Tensor of shape (5, 8, 8), because
    each spatial size shrinks to 10 - 3 + 1 = 8.
    """
    print("🔬 Running Integration Test: Conv2D with Tensors...")

    # Layer first, then data, so any random draws happen in the same order.
    conv_layer = Conv2D((3, 3))

    # Batch laid out as (batch_size, height, width).
    batch = Tensor(np.random.randn(5, 10, 10))

    output_tensor = conv_layer(batch)

    expected_shape = (5, 8, 8)
    assert isinstance(output_tensor, Tensor), "Conv2D output must be a Tensor"
    assert output_tensor.shape == expected_shape, f"Expected output shape {expected_shape}, but got {output_tensor.shape}"
    print("✅ Integration Test Passed: Conv2D layer correctly transformed image tensor.")
if __name__ == "__main__":
test_module_conv2d_tensor_compatibility()
from tito.tools.testing import run_module_tests_auto
# Automatically discover and run all tests in this module

View File

@@ -178,8 +178,8 @@ Let's build the fundamental attention function!
# %% nbgrader={"grade": false, "grade_id": "scaled-dot-product-attention", "locked": false, "schema_version": 3, "solution": true, "task": false}
#| export
def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
def scaled_dot_product_attention(Q: Tensor, K: Tensor, V: Tensor,
mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
"""
Scaled Dot-Product Attention - The foundation of all transformer models.
@@ -214,14 +214,14 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
- Attention weights are interpretable - you can visualize them!
Args:
Q: Query matrix of shape (..., seq_len_q, d_k)
K: Key matrix of shape (..., seq_len_k, d_k)
V: Value matrix of shape (..., seq_len_v, d_v)
mask: Optional mask of shape (..., seq_len_q, seq_len_k)
Q: Query tensor of shape (..., seq_len_q, d_k)
K: Key tensor of shape (..., seq_len_k, d_k)
V: Value tensor of shape (..., seq_len_v, d_v)
mask: Optional mask tensor of shape (..., seq_len_q, seq_len_k)
Returns:
output: Attention output (..., seq_len_q, d_v)
attention_weights: Attention probabilities (..., seq_len_q, seq_len_k)
output: Attention output tensor (..., seq_len_q, d_v)
attention_weights: Attention probabilities tensor (..., seq_len_q, seq_len_k)
"""
### BEGIN SOLUTION
# Get the dimension for scaling
@@ -229,29 +229,28 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
# Step 1: Compute attention scores (QK^T)
# This measures similarity between each query and each key
scores = np.matmul(Q, np.swapaxes(K, -2, -1)) # (..., seq_len_q, seq_len_k)
scores_data = np.matmul(Q.data, np.swapaxes(K.data, -2, -1))
# Step 2: Scale by √d_k to prevent exploding gradients
scores = scores / math.sqrt(d_k)
scores_data = scores_data / math.sqrt(d_k)
# Step 3: Apply mask if provided (for padding or causality)
if mask is not None:
# Replace masked positions with large negative values
# This makes softmax output ~0 for these positions
scores = np.where(mask == 0, -1e9, scores)
scores_data = np.where(mask.data == 0, -1e9, scores_data)
# Step 4: Apply softmax to get attention probabilities
# Each row sums to 1, representing where to focus attention
# Using numerically stable softmax
scores_max = np.max(scores, axis=-1, keepdims=True)
scores_exp = np.exp(scores - scores_max)
attention_weights = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
scores_max = np.max(scores_data, axis=-1, keepdims=True)
scores_exp = np.exp(scores_data - scores_max)
attention_weights_data = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
# Step 5: Apply attention weights to values
# This gives us the weighted combination of values
output = np.matmul(attention_weights, V) # (..., seq_len_q, d_v)
output_data = np.matmul(attention_weights_data, V.data)
return output, attention_weights
return Tensor(output_data), Tensor(attention_weights_data)
### END SOLUTION
# %% [markdown]
@@ -262,54 +261,47 @@ Once you implement the `scaled_dot_product_attention` function above, run this c
"""
# %% nbgrader={"grade": true, "grade_id": "test-attention-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def test_unit_scaled_dot_product_attention():
    """Unit test for the scaled dot-product attention implementation.

    Verifies, for Tensor inputs:
    - the output and attention-weight shapes,
    - that every row of attention weights sums to 1 and is non-negative,
    - that with a lower-triangular mask EVERY masked position receives a
      near-zero weight (not just one sampled position).
    """
    print("🔬 Unit Test: Scaled Dot-Product Attention...")

    # Create simple test data
    seq_len, d_model = 4, 6
    np.random.seed(42)  # fixed seed keeps the run reproducible

    # Define Q, K, V matrices
    Q = Tensor(np.random.rand(seq_len, d_model))
    K = Tensor(np.random.rand(seq_len, d_model))
    V = Tensor(np.random.rand(seq_len, d_model))

    print(f"📊 Input shapes: Q{Q.shape}, K{K.shape}, V{V.shape}")

    # Test without mask
    output, attention_weights = scaled_dot_product_attention(Q, K, V)

    print(f"📊 Output shapes: output{output.shape}, weights{attention_weights.shape}")

    # Check output shapes
    assert output.shape == (seq_len, d_model), \
        f"Output shape should be {(seq_len, d_model)}, got {output.shape}"
    assert attention_weights.shape == (seq_len, seq_len), \
        f"Weights shape should be {(seq_len, seq_len)}, got {attention_weights.shape}"

    # Check that attention weights form a probability distribution per query
    weights_sum = np.sum(attention_weights.data, axis=-1)
    assert np.allclose(weights_sum, 1.0), f"Attention weights should sum to 1, got {weights_sum}"
    assert np.all(attention_weights.data >= 0), "All attention weights should be non-negative"
    print("✅ Attention without mask works correctly")

    # Test with mask
    mask = Tensor(np.tril(np.ones((seq_len, seq_len))))  # Lower triangular mask
    output_masked, weights_masked = scaled_dot_product_attention(Q, K, V, mask)
    assert output_masked.shape == (seq_len, d_model), \
        f"Masked output shape should be {(seq_len, d_model)}, got {output_masked.shape}"

    # Softmax never yields an exact zero in general, so compare every masked
    # weight (mask == 0) against a small tolerance.
    masked_weights = weights_masked.data[mask.data == 0]
    assert np.all(masked_weights < 1e-6), \
        f"Masked weights should be close to 0, got max {masked_weights.max()}"
    print("✅ Attention with mask works correctly")

    print("📈 Progress: Scaled dot-product attention ✓")
# Run the test
test_scaled_dot_product_attention()
test_unit_scaled_dot_product_attention()
# %% [markdown]
"""
@@ -370,7 +362,7 @@ class SelfAttention:
print(f"🔧 SelfAttention: d_model={d_model}")
### END SOLUTION
def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
"""
Forward pass of self-attention.
@@ -383,7 +375,7 @@ class SelfAttention:
EXAMPLE USAGE:
```python
x = np.random.randn(seq_len, d_model) # Input sequence
x = Tensor(np.random.randn(seq_len, d_model)) # Input sequence
output, weights = self_attn.forward(x)
# weights[i,j] = how much position i attends to position j
```
@@ -411,7 +403,7 @@ class SelfAttention:
return scaled_dot_product_attention(x, x, x, mask)
### END SOLUTION
def __call__(self, x: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
    """Delegate to `forward` so an instance can be invoked like a function."""
    result = self.forward(x, mask)
    return result
@@ -423,8 +415,8 @@ Once you implement the SelfAttention class above, run this cell to test it:
"""
# %% nbgrader={"grade": true, "grade_id": "test-self-attention-immediate", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false}
def test_self_attention():
"""Test self-attention wrapper"""
def test_unit_self_attention():
"""Unit test for the self-attention wrapper."""
print("🔬 Unit Test: Self-Attention...")
# Test parameters
@@ -433,7 +425,7 @@ def test_self_attention():
np.random.seed(42)
# Create test data (like word embeddings)
x = np.random.randn(seq_len, d_model) * 0.1
x = Tensor(np.random.randn(seq_len, d_model) * 0.1)
print(f"📊 Test setup: d_model={d_model}, seq_len={seq_len}")
@@ -448,7 +440,7 @@ def test_self_attention():
# Verify properties
assert output.shape == x.shape, f"Output shape should match input shape {x.shape}, got {output.shape}"
assert weights.shape == (seq_len, seq_len), f"Attention weights shape should be {(seq_len, seq_len)}, got {weights.shape}"
assert np.allclose(np.sum(weights, axis=-1), 1.0), "Attention weights should sum to 1"
assert np.allclose(np.sum(weights.data, axis=-1), 1.0), "Attention weights should sum to 1"
assert weights.shape[0] == weights.shape[1], "Self-attention weights should be square matrix"
print("✅ Output shape preserved: True")
@@ -458,7 +450,7 @@ def test_self_attention():
print("📈 Progress: Self-Attention ✓")
# Run the test
test_self_attention()
test_unit_self_attention()
# %% [markdown]
"""
@@ -622,8 +614,8 @@ Once you implement the masking functions above, run this cell to test them:
"""
# %% nbgrader={"grade": true, "grade_id": "test-masking-immediate", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false}
def test_attention_masking():
"""Test attention masking utilities"""
def test_unit_attention_masking():
"""Unit test for the attention masking utilities."""
print("🔬 Unit Test: Attention Masking...")
# Test causal mask
@@ -670,7 +662,7 @@ def test_attention_masking():
print("📈 Progress: Attention Masking ✓")
# Run the test
test_attention_masking()
test_unit_attention_masking()
# %% [markdown]
"""
@@ -681,20 +673,20 @@ Let's test all components working together in a realistic scenario similar to ho
"""
# %% nbgrader={"grade": true, "grade_id": "test-integration-final", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def test_complete_attention_system():
"""Test the complete attention system working together"""
print("🔬 Unit Test: Complete Attention System Integration...")
def test_unit_complete_attention_system():
"""Comprehensive unit test for the entire attention system."""
print("🔬 Comprehensive Test: Complete Attention System...")
# Test parameters
d_model = 64
seq_len = 16
d_model = 32
seq_len = 8
batch_size = 2
np.random.seed(42)
print(f"📊 Integration test: d_model={d_model}, seq_len={seq_len}, batch_size={batch_size}")
# Step 1: Create input embeddings (simulating word embeddings)
embeddings = np.random.randn(batch_size, seq_len, d_model) * 0.1
embeddings = Tensor(np.random.randn(batch_size, seq_len, d_model) * 0.1)
print(f"📊 Input embeddings: {embeddings.shape}")
# Step 2: Test basic attention
@@ -704,31 +696,31 @@ def test_complete_attention_system():
# Step 3: Test self-attention wrapper
self_attn = SelfAttention(d_model)
self_output, self_weights = self_attn(embeddings[0]) # Single batch item
self_output, self_weights = self_attn(Tensor(embeddings.data[0])) # Single batch item
assert self_output.shape == (seq_len, d_model), "Self-attention should preserve shape"
print(f"✅ Self-attention output: {self_output.shape}")
# Step 4: Test with causal mask (like GPT)
causal_mask = create_causal_mask(seq_len)
causal_mask = Tensor(create_causal_mask(seq_len))
causal_output, causal_weights = scaled_dot_product_attention(
embeddings[0], embeddings[0], embeddings[0], causal_mask
Tensor(embeddings.data[0]), Tensor(embeddings.data[0]), Tensor(embeddings.data[0]), causal_mask
)
assert causal_output.shape == (seq_len, d_model), "Causal attention should preserve shape"
print(f"✅ Causal attention works: {causal_output.shape}")
# Step 5: Test with padding mask (variable lengths)
lengths = [seq_len, seq_len-3] # Different sequence lengths
padding_mask = create_padding_mask(lengths, seq_len)
padding_mask = Tensor(create_padding_mask(lengths, seq_len))
padded_output, padded_weights = scaled_dot_product_attention(
embeddings[0], embeddings[0], embeddings[0], padding_mask[0]
Tensor(embeddings.data[0]), Tensor(embeddings.data[0]), Tensor(embeddings.data[0]), Tensor(padding_mask.data[0])
)
assert padded_output.shape == (seq_len, d_model), "Padding attention should preserve shape"
print(f"✅ Padding mask works: {padded_output.shape}")
# Step 6: Verify all outputs have correct properties
assert np.allclose(np.sum(attention_weights, axis=-1), 1.0), "All attention weights should sum to 1"
assert np.allclose(np.sum(attention_weights.data, axis=-1), 1.0), "All attention weights should sum to 1"
assert output.shape == embeddings.shape, "All outputs should preserve input shape"
assert np.all(np.triu(causal_weights, k=1) < 1e-6), "Causal masking should work"
assert np.all(np.triu(causal_weights.data, k=1) < 1e-6), "Causal masking should work"
print("✅ All attention weights sum to 1: True")
print("✅ All outputs preserve input shape: True")
@@ -736,7 +728,7 @@ def test_complete_attention_system():
print("📈 Progress: Complete Attention System ✓")
# Run the test
test_complete_attention_system()
test_unit_complete_attention_system()
# %% [markdown]
"""
@@ -759,25 +751,29 @@ simple_seq = np.array([
print(f"🎯 Simple test sequence shape: {simple_seq.shape}")
# Apply attention
output, weights = scaled_dot_product_attention(simple_seq, simple_seq, simple_seq)
output, weights = scaled_dot_product_attention(Tensor(simple_seq), Tensor(simple_seq), Tensor(simple_seq))
print(f"🎯 Attention pattern analysis:")
print(f"Position 0 attends most to position: {np.argmax(weights[0])}")
print(f"Position 3 attends most to position: {np.argmax(weights[3])}")
print(f"Position 0 attends most to position: {np.argmax(weights.data[0])}")
print(f"Position 3 attends most to position: {np.argmax(weights.data[3])}")
print(f"✅ Positions with same content should attend to each other!")
# Test with causal masking
causal_mask = create_causal_mask(4)
output_causal, weights_causal = scaled_dot_product_attention(simple_seq, simple_seq, simple_seq, causal_mask)
output_causal, weights_causal = scaled_dot_product_attention(Tensor(simple_seq), Tensor(simple_seq), Tensor(simple_seq), Tensor(causal_mask))
print(f"🎯 With causal masking:")
print(f"Position 3 can only attend to positions 0-3: {np.sum(weights_causal[3, :]) > 0.99}")
print(f"Position 3 can only attend to positions 0-3: {np.sum(weights_causal.data[3, :]) > 0.99}")
def plot_attention_patterns(weights, weights_causal):
"""Visualize attention patterns."""
if not _should_show_plots():
return
if _should_show_plots():
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.imshow(weights, cmap='Blues')
plt.imshow(weights.data, cmap='Blues')
plt.title('Full Attention Weights\n(Darker = Higher Attention)')
plt.xlabel('Key Position')
plt.ylabel('Query Position')
@@ -786,20 +782,20 @@ if _should_show_plots():
# Add text annotations
for i in range(4):
for j in range(4):
plt.text(j, i, f'{weights[i,j]:.2f}',
plt.text(j, i, f'{weights.data[i,j]:.2f}',
ha='center', va='center',
color='white' if weights[i,j] > 0.5 else 'black')
color='white' if weights.data[i,j] > 0.5 else 'black')
plt.subplot(1, 3, 2)
plt.imshow(weights_causal, cmap='Blues')
plt.imshow(weights_causal.data, cmap='Blues')
plt.title('Causal Attention Weights\n(Upper triangle masked)')
plt.xlabel('Key Position')
plt.ylabel('Query Position')
plt.colorbar()
plt.subplot(1, 3, 3)
plt.plot(weights[0], 'o-', label='Position 0 attention')
plt.plot(weights[3], 's-', label='Position 3 attention')
plt.plot(weights.data[0], 'o-', label='Position 0 attention')
plt.plot(weights.data[3], 's-', label='Position 3 attention')
plt.xlabel('Attending to Position')
plt.ylabel('Attention Weight')
plt.title('Attention Distribution')
@@ -809,6 +805,8 @@ if _should_show_plots():
plt.tight_layout()
plt.show()
plot_attention_patterns(weights, weights_causal)
print("🎯 Attention learns to focus on similar content!")
print("\n" + "="*50)
@@ -824,39 +822,39 @@ print("✅ Complete integration tests")
print("\nYou now understand the core mechanism powering modern AI! 🚀")
print("Next: Learn how to build complete transformer models using this foundation.")
def test_unit_attention_mechanism():
    """Unit test for the attention mechanism implementation.

    Calls ``scaled_dot_product_attention`` on three random 4x6 inputs wrapped
    in ``Tensor`` and checks the output shape, the weight-matrix shape, and
    that every row of the attention weights sums to 1.

    Raises:
        AssertionError: If any of the shape or normalisation checks fail.
    """
    print("🔬 Unit Test: Attention Mechanism...")

    # Test basic attention: small-magnitude random inputs wrapped as Tensors
    Q = Tensor(np.random.randn(4, 6) * 0.1)
    K = Tensor(np.random.randn(4, 6) * 0.1)
    V = Tensor(np.random.randn(4, 6) * 0.1)

    output, weights = scaled_dot_product_attention(Q, K, V)

    assert output.shape == (4, 6), "Attention should produce correct output shape"
    assert weights.shape == (4, 4), "Attention weights should be square matrix"
    # Reduce over `.data` (presumably the underlying ndarray) instead of the
    # Tensor wrapper itself.
    assert np.allclose(np.sum(weights.data, axis=-1), 1.0), "Attention weights should sum to 1"

    print("✅ Attention mechanism works correctly")
def test_unit_self_attention_wrapper():
    """Unit test for the self-attention wrapper implementation.

    Builds ``SelfAttention(d_model=32)``, applies it to a random 8x32 input
    wrapped in ``Tensor``, and checks that the output keeps the input shape,
    the weights are 8x8, and each row of the weights sums to 1.

    Raises:
        AssertionError: If any of the shape or normalisation checks fail.
    """
    print("🔬 Unit Test: Self-Attention Wrapper...")

    # Test self-attention on a Tensor input
    self_attn = SelfAttention(d_model=32)
    x = Tensor(np.random.randn(8, 32) * 0.1)
    output, weights = self_attn(x)

    assert output.shape == x.shape, "Self-attention should preserve input shape"
    assert weights.shape == (8, 8), "Self-attention weights should be square"
    # Reduce over `.data` (presumably the underlying ndarray) instead of the
    # Tensor wrapper itself.
    assert np.allclose(np.sum(weights.data, axis=-1), 1.0), "Weights should sum to 1"

    print("✅ Self-attention wrapper works correctly")
def test_masking_utilities():
"""Test attention masking utilities."""
def test_unit_masking_utilities():
"""Unit test for the attention masking utilities."""
print("🔬 Unit Test: Masking Utilities...")
# Test causal mask
@@ -888,7 +886,41 @@ Time to test your implementation! This section uses TinyTorch's standardized tes
# This cell is locked to ensure consistent testing across all TinyTorch modules
# =============================================================================
# %% [markdown]
"""
## 🔬 Integration Test: Attention with Tensors
"""
# %%
def test_module_attention_tensor_compatibility():
    """
    Module test covering the attention function together with the Tensor class.

    Passes Tensor-wrapped query/key/value arrays to
    scaled_dot_product_attention and verifies that both results come back as
    Tensors with the expected shapes and that the attention weights are
    normalised along the last axis.
    """
    print("🔬 Running Integration Test: Attention with Tensors...")

    # Inputs: query/key are (batch, seq_len, d_k); value is (batch, seq_len, d_v)
    query = Tensor(np.random.randn(1, 5, 16))
    key = Tensor(np.random.randn(1, 5, 16))
    value = Tensor(np.random.randn(1, 5, 32))

    # Run attention on the Tensor inputs
    output, attn_weights = scaled_dot_product_attention(query, key, value)

    # Type and shape checks for the attended output
    assert isinstance(output, Tensor), "Output should be a Tensor"
    assert output.shape == (1, 5, 32), f"Expected output shape (1, 5, 32), but got {output.shape}"

    # Type and shape checks for the attention weights
    assert isinstance(attn_weights, Tensor), "Attention weights should be a Tensor"
    assert attn_weights.shape == (1, 5, 5), f"Expected weights shape (1, 5, 5), but got {attn_weights.shape}"

    # Each row of weights (last axis) must add up to 1
    row_totals = attn_weights.data.sum(axis=-1)
    assert np.allclose(row_totals, 1.0), "Attention weights should sum to 1"

    print("✅ Integration Test Passed: Scaled dot-product attention is compatible with Tensors.")
if __name__ == "__main__":
test_module_attention_tensor_compatibility()
from tito.tools.testing import run_module_tests_auto
# Automatically discover and run all tests in this module