mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-05-31 21:37:54 -05:00
🧠 Core ML: Standardize test naming in neural network building blocks
- Activations: test_integration_* → test_module_* (module dependency tests) - Layers: test_matrix_multiplication → test_unit_matrix_multiplication - Layers: test_dense_layer → test_unit_dense_layer - Layers: test_layer_activation → test_unit_layer_activation - Dense: test_integration_* → test_module_* (module dependency tests) - Spatial: test_integration_* → test_module_* (module dependency tests) - Attention: test_integration_* → test_module_* (module dependency tests) - Establishes unit vs module test distinction for neural network components
This commit is contained in:
@@ -227,8 +227,8 @@ Once you implement the ReLU forward method above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-relu-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_relu_activation():
|
||||
"""Test ReLU activation function"""
|
||||
def test_unit_relu_activation():
|
||||
"""Unit test for the ReLU activation function."""
|
||||
print("🔬 Unit Test: ReLU Activation...")
|
||||
|
||||
# Create ReLU instance
|
||||
@@ -265,7 +265,7 @@ def test_relu_activation():
|
||||
print(f"✅ Works with multi-dimensional tensors")
|
||||
|
||||
# Run the test
|
||||
test_relu_activation()
|
||||
test_unit_relu_activation()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -365,8 +365,8 @@ Once you implement the Sigmoid forward method above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-sigmoid-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_sigmoid_activation():
|
||||
"""Test Sigmoid activation function"""
|
||||
def test_unit_sigmoid_activation():
|
||||
"""Unit test for the Sigmoid activation function."""
|
||||
print("🔬 Unit Test: Sigmoid Activation...")
|
||||
|
||||
# Create Sigmoid instance
|
||||
@@ -412,7 +412,7 @@ def test_sigmoid_activation():
|
||||
print(f"✅ Shape preservation working")
|
||||
|
||||
# Run the test
|
||||
test_sigmoid_activation()
|
||||
test_unit_sigmoid_activation()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -511,8 +511,8 @@ Once you implement the Tanh forward method above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-tanh-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_tanh_activation():
|
||||
"""Test Tanh activation function"""
|
||||
def test_unit_tanh_activation():
|
||||
"""Unit test for the Tanh activation function."""
|
||||
print("🔬 Unit Test: Tanh Activation...")
|
||||
|
||||
# Create Tanh instance
|
||||
@@ -562,7 +562,7 @@ def test_tanh_activation():
|
||||
print(f"✅ Handles extreme values correctly")
|
||||
|
||||
# Run the test
|
||||
test_tanh_activation()
|
||||
test_unit_tanh_activation()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -679,8 +679,8 @@ Once you implement the Softmax forward method above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-softmax-immediate", "locked": true, "points": 15, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_softmax_activation():
|
||||
"""Test Softmax activation function"""
|
||||
def test_unit_softmax_activation():
|
||||
"""Unit test for the Softmax activation function."""
|
||||
print("🔬 Unit Test: Softmax Activation...")
|
||||
|
||||
# Create Softmax instance
|
||||
@@ -736,7 +736,7 @@ def test_softmax_activation():
|
||||
print(f"✅ Numerically stable with large values")
|
||||
|
||||
# Run the test
|
||||
test_softmax_activation()
|
||||
test_unit_softmax_activation()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -752,8 +752,8 @@ Let's test how all activation functions work together in a realistic neural netw
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-activations-comprehensive", "locked": true, "points": 15, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_activations():
|
||||
"""Test all activation functions working together"""
|
||||
def test_unit_activations_comprehensive():
|
||||
"""Comprehensive unit test for all activation functions working together."""
|
||||
print("🔬 Unit Test: Activation Functions Comprehensive Test...")
|
||||
|
||||
# Create instances of all activation functions
|
||||
@@ -835,7 +835,7 @@ def test_activations():
|
||||
print(f"✅ Ready for neural network integration!")
|
||||
|
||||
# Run the comprehensive test
|
||||
test_activations()
|
||||
test_unit_activations_comprehensive()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -852,7 +852,60 @@ Time to test your implementation! This section uses TinyTorch's standardized tes
|
||||
# This cell is locked to ensure consistent testing across all TinyTorch modules
|
||||
# =============================================================================
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 🔬 Integration Test: Activations with Tensors
|
||||
"""
|
||||
|
||||
# %%
|
||||
def test_module_activations_tensor_compatibility():
    """
    Module-level check that every activation accepts and returns a Tensor.

    Each activation is applied to the same small Tensor and its output is
    compared against a plain NumPy reference computed from the raw data.
    """
    print("🔬 Running Integration Test: Activations with Tensors...")

    # Shared input: a few values on both sides of zero.
    raw = np.array([-2., -1., 0., 1., 2.])
    x = Tensor(raw)

    def _softmax_reference():
        # Subtract the max before exponentiating, as a stable softmax does.
        shifted = np.exp(raw - np.max(raw))
        return shifted / shifted.sum(axis=0)

    # (label used in messages, activation class, NumPy reference)
    cases = (
        ("ReLU", ReLU, lambda: np.maximum(0, raw)),
        ("Sigmoid", Sigmoid, lambda: 1 / (1 + np.exp(-raw))),
        ("Tanh", Tanh, lambda: np.tanh(raw)),
        ("Softmax", Softmax, _softmax_reference),
    )

    for label, activation_cls, reference in cases:
        # Instantiate lazily so each activation is built right before use.
        result = activation_cls()(x)
        expected = reference()
        assert isinstance(result, Tensor), f"{label} output should be a Tensor"
        assert np.allclose(result.data, expected), f"{label} calculation is incorrect"
        if activation_cls is Softmax:
            # Softmax additionally has to yield a probability distribution.
            assert abs(result.data.sum() - 1.0) < 1e-6, "Softmax output should sum to 1"
        print(f"✅ {label} integrates correctly with Tensor.")

    print("✅ Integration Test Passed: All activation functions are compatible with Tensors.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_module_activations_tensor_compatibility()
|
||||
from tito.tools.testing import run_module_tests_auto
|
||||
|
||||
# Automatically discover and run all tests in this module
|
||||
|
||||
@@ -255,7 +255,7 @@ Once you implement the `matmul` function above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-matmul-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_matrix_multiplication():
|
||||
def test_unit_matrix_multiplication():
|
||||
"""Test matrix multiplication implementation"""
|
||||
print("🔬 Unit Test: Matrix Multiplication...")
|
||||
|
||||
@@ -469,7 +469,7 @@ Once you implement the Dense layer above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-dense-layer", "locked": true, "points": 15, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_dense_layer():
|
||||
def test_unit_dense_layer():
|
||||
"""Test Dense layer implementation"""
|
||||
print("🔬 Unit Test: Dense Layer...")
|
||||
|
||||
@@ -555,7 +555,7 @@ final_output = activation_function(linear_output)
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-layer-activation-comprehensive", "locked": true, "points": 15, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_layer_activation():
|
||||
def test_unit_layer_activation():
|
||||
"""Test Dense layer comprehensive testing with activation functions"""
|
||||
print("🔬 Unit Test: Layer-Activation Comprehensive Test...")
|
||||
|
||||
@@ -632,6 +632,45 @@ def test_layer_activation():
|
||||
# Run the test
|
||||
# The test function was renamed to test_unit_layer_activation in this
# commit; call it by its new name so this cell does not raise NameError.
test_unit_layer_activation()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 🔬 Integration Test: Layers with Tensors
|
||||
|
||||
This is our first cumulative integration test.
|
||||
It ensures that the 'Layer' abstraction works correctly with the 'Tensor' class from the previous module.
|
||||
"""
|
||||
|
||||
# %%
|
||||
def test_layer_tensor_integration():
    """
    Check that a Layer subclass accepts a Tensor and hands back a Tensor.

    A throwaway layer that doubles its input is invoked by calling the
    layer object; the result must be a Tensor holding [2, 4, 6].
    """
    print("🔬 Running Integration Test: Layer with Tensor...")

    # Subclass Dense only to reuse its __call__; forward is overridden,
    # so the constructor sizes passed below are placeholders.
    class DoubleLayer(Dense):
        def forward(self, x: Tensor) -> Tensor:
            return x * 2

    layer = DoubleLayer(input_size=1, output_size=1)
    doubled = layer(Tensor([1, 2, 3]))

    expected = np.array([2, 4, 6])
    assert isinstance(doubled, Tensor), "Output should be a Tensor"
    assert np.array_equal(doubled.data, expected), "Output data is incorrect"
    print("✅ Integration Test Passed: Layer correctly processed Tensor.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_layer_tensor_integration()
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 🧪 Module Testing
|
||||
|
||||
@@ -555,6 +555,41 @@ Let's test different network architectures to understand their behavior.
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-architectures", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def plot_network_architectures():
    """
    Draw bar charts of the outputs of three small MLP variants.

    Does nothing when _should_show_plots() reports that plots are disabled.
    Otherwise a ReLU network, a Tanh network and a Softmax classifier are
    built, fed the same single-sample input, and plotted side by side.
    """
    if not _should_show_plots():
        return

    # Build the networks in a fixed order: ReLU, Tanh, then the classifier.
    net_relu = create_mlp(input_size=3, hidden_sizes=[4], output_size=1, activation=ReLU)
    net_tanh = create_mlp(input_size=3, hidden_sizes=[4], output_size=1, activation=Tanh)
    net_softmax = create_mlp(input_size=3, hidden_sizes=[4], output_size=3, output_activation=Softmax)

    # One sample with three features, shared by all networks.
    sample = Tensor([[1.0, 2.0, 3.0]])
    out_relu = net_relu(sample)
    out_tanh = net_tanh(sample)
    out_softmax = net_softmax(sample)

    _, (ax_relu, ax_tanh, ax_softmax) = plt.subplots(1, 3, figsize=(15, 4))

    # (axis, title, bar labels, bar heights, colour) for each panel.
    panels = (
        (ax_relu, "ReLU Network Output", ['Output'], [out_relu.data[0][0]], 'skyblue'),
        (ax_tanh, "Tanh Network Output", ['Output'], [out_tanh.data[0][0]], 'salmon'),
        (ax_softmax, "Softmax Classifier Output",
         [f"Class {i}" for i in range(3)], out_softmax.data[0], 'lightgreen'),
    )
    for axis, title, labels, heights, colour in panels:
        axis.set_title(title)
        axis.bar(labels, heights, color=colour)

    plt.tight_layout()
    plt.show()
|
||||
|
||||
def test_unit_network_architectures():
|
||||
"""Unit test for different network architectures."""
|
||||
# Test different architectures
|
||||
print("🔬 Unit Test: Network Architecture Variations...")
|
||||
|
||||
@@ -602,6 +637,9 @@ try:
|
||||
|
||||
print("✅ All network architectures work correctly")
|
||||
|
||||
# Plot the architectures if not in test mode
|
||||
plot_network_architectures()
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Architecture test failed: {e}")
|
||||
raise
|
||||
@@ -780,8 +818,8 @@ class MLP:
|
||||
|
||||
# %% [markdown]
|
||||
|
||||
def test_sequential_networks():
|
||||
"""Test Sequential network implementation comprehensively."""
|
||||
def test_unit_sequential_networks():
|
||||
"""Unit test for the Sequential network implementation."""
|
||||
print("🔬 Unit Test: Sequential Networks...")
|
||||
|
||||
# Test basic Sequential network
|
||||
@@ -801,8 +839,8 @@ def test_sequential_networks():
|
||||
|
||||
print("✅ Sequential networks work correctly")
|
||||
|
||||
def test_mlp_creation():
|
||||
"""Test MLP creation function comprehensively."""
|
||||
def test_unit_mlp_creation():
|
||||
"""Unit test for the MLP creation function."""
|
||||
print("🔬 Unit Test: MLP Creation...")
|
||||
|
||||
# Test different MLP architectures
|
||||
@@ -821,8 +859,8 @@ def test_mlp_creation():
|
||||
|
||||
print("✅ MLP creation works correctly")
|
||||
|
||||
def test_network_architectures():
|
||||
"""Test different network architectures comprehensively."""
|
||||
def test_unit_network_architectures():
|
||||
"""Unit test for different network architectures."""
|
||||
print("🔬 Unit Test: Network Architectures...")
|
||||
|
||||
# Test different activation functions
|
||||
@@ -846,8 +884,8 @@ def test_network_architectures():
|
||||
|
||||
print("✅ Network architectures work correctly")
|
||||
|
||||
def test_networks():
|
||||
"""Test network comprehensive testing with real ML scenarios."""
|
||||
def test_unit_network_applications():
|
||||
"""Comprehensive unit test for network applications in real ML scenarios."""
|
||||
print("🔬 Comprehensive Test: Network Applications...")
|
||||
|
||||
# Test multi-class classification
|
||||
@@ -874,7 +912,43 @@ Time to test your implementation! This section uses TinyTorch's standardized tes
|
||||
# This cell is locked to ensure consistent testing across all TinyTorch modules
|
||||
# =============================================================================
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 🔬 Integration Test: End-to-End Network Forward Pass
|
||||
"""
|
||||
|
||||
# %%
|
||||
def test_module_full_network_forward_pass():
    """
    Module-level check of a complete forward pass through a small MLP.

    Exercises Tensor, Dense, ReLU and Sequential together: a random batch
    is pushed through Dense -> ReLU -> Dense and the result must be a
    Tensor with one row per sample and one column per output unit.
    """
    print("🔬 Running Integration Test: Full Network Forward Pass...")

    batch_size, n_features, n_hidden, n_outputs = 5, 3, 4, 2

    # Input (3) -> Dense(4) -> ReLU -> Dense(2) -> Output
    layers = [Dense(n_features, n_hidden), ReLU(), Dense(n_hidden, n_outputs)]
    network = Sequential(layers)

    # Random batch: batch_size samples with n_features features each.
    batch = Tensor(np.random.randn(batch_size, n_features))
    prediction = network(batch)

    expected_shape = (batch_size, n_outputs)
    assert isinstance(prediction, Tensor), "Network output must be a Tensor"
    assert prediction.shape == expected_shape, (
        f"Expected output shape {expected_shape}, but got {prediction.shape}"
    )
    print("✅ Integration Test Passed: Full network forward pass is successful.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_module_full_network_forward_pass()
|
||||
from tito.tools.testing import run_module_tests_auto
|
||||
|
||||
# Automatically discover and run all tests in this module
|
||||
|
||||
@@ -347,36 +347,34 @@ class Conv2D:
|
||||
|
||||
def forward(self, x):
    """
    Forward pass through the Conv2D layer.

    Args:
        x: Input tensor. A 3-dimensional input is treated as a batch of
            images with shape (batch_size, H, W); any other input is
            passed to conv2d_naive as a single image.

    Returns:
        Tensor wrapping the result of conv2d_naive applied with
        self.kernel. For a batched input the per-image results are
        stacked along a new leading batch axis.
    """
    ### BEGIN SOLUTION
    if len(x.shape) == 3:
        # Batched input: convolve each image independently, then stack
        # the results into a single array.
        batch_size = x.shape[0]
        output_data = np.stack(
            [conv2d_naive(x.data[i], self.kernel) for i in range(batch_size)]
        )
    else:
        # Single image case.
        output_data = conv2d_naive(x.data, self.kernel)

    return Tensor(output_data)
    ### END SOLUTION
|
||||
|
||||
def __call__(self, x):
|
||||
"""Make layer callable: layer(x) same as layer.forward(x)"""
|
||||
@@ -725,8 +723,8 @@ except Exception as e:
|
||||
|
||||
print("📈 Final Progress: Complete CNN system ready for computer vision!")
|
||||
|
||||
def test_convolution_operation():
|
||||
"""Test convolution operation implementation comprehensively."""
|
||||
def test_unit_convolution_operation():
|
||||
"""Unit test for the convolution operation implementation."""
|
||||
print("🔬 Unit Test: Convolution Operation...")
|
||||
|
||||
# Test basic convolution
|
||||
@@ -740,8 +738,8 @@ def test_convolution_operation():
|
||||
|
||||
print("✅ Convolution operation works correctly")
|
||||
|
||||
def test_conv2d_layer():
|
||||
"""Test Conv2D layer implementation comprehensively."""
|
||||
def test_unit_conv2d_layer():
|
||||
"""Unit test for the Conv2D layer implementation."""
|
||||
print("🔬 Unit Test: Conv2D Layer...")
|
||||
|
||||
# Test Conv2D layer
|
||||
@@ -755,8 +753,8 @@ def test_conv2d_layer():
|
||||
|
||||
print("✅ Conv2D layer works correctly")
|
||||
|
||||
def test_flatten_function():
|
||||
"""Test flatten function implementation comprehensively."""
|
||||
def test_unit_flatten_function():
|
||||
"""Unit test for the flatten function implementation."""
|
||||
print("🔬 Unit Test: Flatten Function...")
|
||||
|
||||
# Test flatten function
|
||||
@@ -786,7 +784,42 @@ Time to test your implementation! This section uses TinyTorch's standardized tes
|
||||
# This cell is locked to ensure consistent testing across all TinyTorch modules
|
||||
# =============================================================================
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 🔬 Integration Test: Conv2D Layer with Tensors
|
||||
"""
|
||||
|
||||
# %%
|
||||
def test_module_conv2d_tensor_compatibility():
    """
    Module-level check that Conv2D consumes and produces Tensor objects.

    A batch of random single-channel images is passed through a Conv2D
    layer and the shape of the returned Tensor is verified.
    """
    print("🔬 Running Integration Test: Conv2D with Tensors...")

    kernel_size = (3, 3)
    batch_size, height, width = 5, 10, 10

    # Build the layer first, then the random image batch.
    conv_layer = Conv2D(kernel_size)
    images = Tensor(np.random.randn(batch_size, height, width))

    result = conv_layer(images)

    # Each spatial dimension shrinks by (kernel - 1): 10 - 3 + 1 = 8.
    expected_shape = (
        batch_size,
        height - kernel_size[0] + 1,
        width - kernel_size[1] + 1,
    )
    assert isinstance(result, Tensor), "Conv2D output must be a Tensor"
    assert result.shape == expected_shape, (
        f"Expected output shape {expected_shape}, but got {result.shape}"
    )
    print("✅ Integration Test Passed: Conv2D layer correctly transformed image tensor.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_module_conv2d_tensor_compatibility()
|
||||
from tito.tools.testing import run_module_tests_auto
|
||||
|
||||
# Automatically discover and run all tests in this module
|
||||
|
||||
@@ -178,8 +178,8 @@ Let's build the fundamental attention function!
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "scaled-dot-product-attention", "locked": false, "schema_version": 3, "solution": true, "task": false}
|
||||
#| export
|
||||
def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
|
||||
mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
|
||||
def scaled_dot_product_attention(Q: Tensor, K: Tensor, V: Tensor,
|
||||
mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
|
||||
"""
|
||||
Scaled Dot-Product Attention - The foundation of all transformer models.
|
||||
|
||||
@@ -214,14 +214,14 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
|
||||
- Attention weights are interpretable - you can visualize them!
|
||||
|
||||
Args:
|
||||
Q: Query matrix of shape (..., seq_len_q, d_k)
|
||||
K: Key matrix of shape (..., seq_len_k, d_k)
|
||||
V: Value matrix of shape (..., seq_len_v, d_v)
|
||||
mask: Optional mask of shape (..., seq_len_q, seq_len_k)
|
||||
Q: Query tensor of shape (..., seq_len_q, d_k)
|
||||
K: Key tensor of shape (..., seq_len_k, d_k)
|
||||
V: Value tensor of shape (..., seq_len_v, d_v)
|
||||
mask: Optional mask tensor of shape (..., seq_len_q, seq_len_k)
|
||||
|
||||
Returns:
|
||||
output: Attention output (..., seq_len_q, d_v)
|
||||
attention_weights: Attention probabilities (..., seq_len_q, seq_len_k)
|
||||
output: Attention output tensor (..., seq_len_q, d_v)
|
||||
attention_weights: Attention probabilities tensor (..., seq_len_q, seq_len_k)
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Get the dimension for scaling
|
||||
@@ -229,29 +229,28 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
|
||||
|
||||
# Step 1: Compute attention scores (QK^T)
|
||||
# This measures similarity between each query and each key
|
||||
scores = np.matmul(Q, np.swapaxes(K, -2, -1)) # (..., seq_len_q, seq_len_k)
|
||||
scores_data = np.matmul(Q.data, np.swapaxes(K.data, -2, -1))
|
||||
|
||||
# Step 2: Scale by √d_k to prevent exploding gradients
|
||||
scores = scores / math.sqrt(d_k)
|
||||
scores_data = scores_data / math.sqrt(d_k)
|
||||
|
||||
# Step 3: Apply mask if provided (for padding or causality)
|
||||
if mask is not None:
|
||||
# Replace masked positions with large negative values
|
||||
# This makes softmax output ~0 for these positions
|
||||
scores = np.where(mask == 0, -1e9, scores)
|
||||
scores_data = np.where(mask.data == 0, -1e9, scores_data)
|
||||
|
||||
# Step 4: Apply softmax to get attention probabilities
|
||||
# Each row sums to 1, representing where to focus attention
|
||||
# Using numerically stable softmax
|
||||
scores_max = np.max(scores, axis=-1, keepdims=True)
|
||||
scores_exp = np.exp(scores - scores_max)
|
||||
attention_weights = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
|
||||
scores_max = np.max(scores_data, axis=-1, keepdims=True)
|
||||
scores_exp = np.exp(scores_data - scores_max)
|
||||
attention_weights_data = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
|
||||
|
||||
# Step 5: Apply attention weights to values
|
||||
# This gives us the weighted combination of values
|
||||
output = np.matmul(attention_weights, V) # (..., seq_len_q, d_v)
|
||||
output_data = np.matmul(attention_weights_data, V.data)
|
||||
|
||||
return output, attention_weights
|
||||
return Tensor(output_data), Tensor(attention_weights_data)
|
||||
### END SOLUTION
|
||||
|
||||
# %% [markdown]
|
||||
@@ -262,54 +261,47 @@ Once you implement the `scaled_dot_product_attention` function above, run this c
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-attention-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_unit_scaled_dot_product_attention():
    """
    Unit test for the scaled dot-product attention implementation.

    Covers the unmasked case (shapes, rows summing to 1, non-negative
    weights) and the masked case with a lower-triangular mask, where every
    masked position must receive a near-zero weight.
    """
    print("🔬 Unit Test: Scaled Dot-Product Attention...")

    # Create simple test data
    seq_len, d_model = 4, 6
    np.random.seed(42)

    # Define Q, K, V matrices
    Q = Tensor(np.random.rand(seq_len, d_model))
    K = Tensor(np.random.rand(seq_len, d_model))
    V = Tensor(np.random.rand(seq_len, d_model))

    print(f"📊 Input shapes: Q{Q.shape}, K{K.shape}, V{V.shape}")

    # Test without mask
    output, attention_weights = scaled_dot_product_attention(Q, K, V)

    print(f"📊 Output shapes: output{output.shape}, weights{attention_weights.shape}")

    # Check output shapes against the test dimensions defined above
    assert output.shape == (seq_len, d_model), \
        f"Output shape should be {(seq_len, d_model)}, got {output.shape}"
    assert attention_weights.shape == (seq_len, seq_len), \
        f"Weights shape should be {(seq_len, seq_len)}, got {attention_weights.shape}"

    # Each row of attention weights must be a probability distribution
    weights_sum = np.sum(attention_weights.data, axis=-1)
    assert np.allclose(weights_sum, 1.0), f"Attention weights should sum to 1, got {weights_sum}"
    assert np.all(attention_weights.data >= 0), "All attention weights should be non-negative"

    print("✅ Attention without mask works correctly")

    # Test with mask
    mask = Tensor(np.tril(np.ones((seq_len, seq_len))))  # Lower triangular mask
    output_masked, weights_masked = scaled_dot_product_attention(Q, K, V, mask)

    # Check every masked position, not just a single sample entry:
    # softmax never yields exactly 0, so compare against a small threshold.
    masked_weights = weights_masked.data[mask.data == 0]
    assert np.all(masked_weights < 1e-6), \
        f"Masked weights should be close to 0, got {masked_weights}"

    # Masking must not break normalisation or the output shape
    assert np.allclose(np.sum(weights_masked.data, axis=-1), 1.0), \
        "Masked attention weights should still sum to 1"
    assert output_masked.shape == (seq_len, d_model), \
        f"Masked output shape should be {(seq_len, d_model)}, got {output_masked.shape}"

    print("✅ Attention with mask works correctly")

    print("📈 Progress: Scaled dot-product attention ✓")


# Run the test
test_unit_scaled_dot_product_attention()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -370,7 +362,7 @@ class SelfAttention:
|
||||
print(f"🔧 SelfAttention: d_model={d_model}")
|
||||
### END SOLUTION
|
||||
|
||||
def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
|
||||
def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
|
||||
"""
|
||||
Forward pass of self-attention.
|
||||
|
||||
@@ -383,7 +375,7 @@ class SelfAttention:
|
||||
|
||||
EXAMPLE USAGE:
|
||||
```python
|
||||
x = np.random.randn(seq_len, d_model) # Input sequence
|
||||
x = Tensor(np.random.randn(seq_len, d_model)) # Input sequence
|
||||
output, weights = self_attn.forward(x)
|
||||
# weights[i,j] = how much position i attends to position j
|
||||
```
|
||||
@@ -411,7 +403,7 @@ class SelfAttention:
|
||||
return scaled_dot_product_attention(x, x, x, mask)
|
||||
### END SOLUTION
|
||||
|
||||
def __call__(self, x: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
    """Allow the instance to be invoked like a function; delegates to forward()."""
    result = self.forward(x, mask)
    return result
|
||||
|
||||
@@ -423,8 +415,8 @@ Once you implement the SelfAttention class above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-self-attention-immediate", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_self_attention():
|
||||
"""Test self-attention wrapper"""
|
||||
def test_unit_self_attention():
|
||||
"""Unit test for the self-attention wrapper."""
|
||||
print("🔬 Unit Test: Self-Attention...")
|
||||
|
||||
# Test parameters
|
||||
@@ -433,7 +425,7 @@ def test_self_attention():
|
||||
np.random.seed(42)
|
||||
|
||||
# Create test data (like word embeddings)
|
||||
x = np.random.randn(seq_len, d_model) * 0.1
|
||||
x = Tensor(np.random.randn(seq_len, d_model) * 0.1)
|
||||
|
||||
print(f"📊 Test setup: d_model={d_model}, seq_len={seq_len}")
|
||||
|
||||
@@ -448,7 +440,7 @@ def test_self_attention():
|
||||
# Verify properties
|
||||
assert output.shape == x.shape, f"Output shape should match input shape {x.shape}, got {output.shape}"
|
||||
assert weights.shape == (seq_len, seq_len), f"Attention weights shape should be {(seq_len, seq_len)}, got {weights.shape}"
|
||||
assert np.allclose(np.sum(weights, axis=-1), 1.0), "Attention weights should sum to 1"
|
||||
assert np.allclose(np.sum(weights.data, axis=-1), 1.0), "Attention weights should sum to 1"
|
||||
assert weights.shape[0] == weights.shape[1], "Self-attention weights should be square matrix"
|
||||
|
||||
print("✅ Output shape preserved: True")
|
||||
@@ -458,7 +450,7 @@ def test_self_attention():
|
||||
print("📈 Progress: Self-Attention ✓")
|
||||
|
||||
# Run the test
|
||||
test_self_attention()
|
||||
test_unit_self_attention()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -622,8 +614,8 @@ Once you implement the masking functions above, run this cell to test them:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-masking-immediate", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_attention_masking():
|
||||
"""Test attention masking utilities"""
|
||||
def test_unit_attention_masking():
|
||||
"""Unit test for the attention masking utilities."""
|
||||
print("🔬 Unit Test: Attention Masking...")
|
||||
|
||||
# Test causal mask
|
||||
@@ -670,7 +662,7 @@ def test_attention_masking():
|
||||
print("📈 Progress: Attention Masking ✓")
|
||||
|
||||
# Run the test
|
||||
test_attention_masking()
|
||||
test_unit_attention_masking()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -681,20 +673,20 @@ Let's test all components working together in a realistic scenario similar to ho
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-integration-final", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_complete_attention_system():
|
||||
"""Test the complete attention system working together"""
|
||||
print("🔬 Unit Test: Complete Attention System Integration...")
|
||||
def test_unit_complete_attention_system():
|
||||
"""Comprehensive unit test for the entire attention system."""
|
||||
print("🔬 Comprehensive Test: Complete Attention System...")
|
||||
|
||||
# Test parameters
|
||||
d_model = 64
|
||||
seq_len = 16
|
||||
d_model = 32
|
||||
seq_len = 8
|
||||
batch_size = 2
|
||||
np.random.seed(42)
|
||||
|
||||
print(f"📊 Integration test: d_model={d_model}, seq_len={seq_len}, batch_size={batch_size}")
|
||||
|
||||
# Step 1: Create input embeddings (simulating word embeddings)
|
||||
embeddings = np.random.randn(batch_size, seq_len, d_model) * 0.1
|
||||
embeddings = Tensor(np.random.randn(batch_size, seq_len, d_model) * 0.1)
|
||||
print(f"📊 Input embeddings: {embeddings.shape}")
|
||||
|
||||
# Step 2: Test basic attention
|
||||
@@ -704,31 +696,31 @@ def test_complete_attention_system():
|
||||
|
||||
# Step 3: Test self-attention wrapper
|
||||
self_attn = SelfAttention(d_model)
|
||||
self_output, self_weights = self_attn(embeddings[0]) # Single batch item
|
||||
self_output, self_weights = self_attn(Tensor(embeddings.data[0])) # Single batch item
|
||||
assert self_output.shape == (seq_len, d_model), "Self-attention should preserve shape"
|
||||
print(f"✅ Self-attention output: {self_output.shape}")
|
||||
|
||||
# Step 4: Test with causal mask (like GPT)
|
||||
causal_mask = create_causal_mask(seq_len)
|
||||
causal_mask = Tensor(create_causal_mask(seq_len))
|
||||
causal_output, causal_weights = scaled_dot_product_attention(
|
||||
embeddings[0], embeddings[0], embeddings[0], causal_mask
|
||||
Tensor(embeddings.data[0]), Tensor(embeddings.data[0]), Tensor(embeddings.data[0]), causal_mask
|
||||
)
|
||||
assert causal_output.shape == (seq_len, d_model), "Causal attention should preserve shape"
|
||||
print(f"✅ Causal attention works: {causal_output.shape}")
|
||||
|
||||
# Step 5: Test with padding mask (variable lengths)
|
||||
lengths = [seq_len, seq_len-3] # Different sequence lengths
|
||||
padding_mask = create_padding_mask(lengths, seq_len)
|
||||
padding_mask = Tensor(create_padding_mask(lengths, seq_len))
|
||||
padded_output, padded_weights = scaled_dot_product_attention(
|
||||
embeddings[0], embeddings[0], embeddings[0], padding_mask[0]
|
||||
Tensor(embeddings.data[0]), Tensor(embeddings.data[0]), Tensor(embeddings.data[0]), Tensor(padding_mask.data[0])
|
||||
)
|
||||
assert padded_output.shape == (seq_len, d_model), "Padding attention should preserve shape"
|
||||
print(f"✅ Padding mask works: {padded_output.shape}")
|
||||
|
||||
# Step 6: Verify all outputs have correct properties
|
||||
assert np.allclose(np.sum(attention_weights, axis=-1), 1.0), "All attention weights should sum to 1"
|
||||
assert np.allclose(np.sum(attention_weights.data, axis=-1), 1.0), "All attention weights should sum to 1"
|
||||
assert output.shape == embeddings.shape, "All outputs should preserve input shape"
|
||||
assert np.all(np.triu(causal_weights, k=1) < 1e-6), "Causal masking should work"
|
||||
assert np.all(np.triu(causal_weights.data, k=1) < 1e-6), "Causal masking should work"
|
||||
|
||||
print("✅ All attention weights sum to 1: True")
|
||||
print("✅ All outputs preserve input shape: True")
|
||||
@@ -736,7 +728,7 @@ def test_complete_attention_system():
|
||||
print("📈 Progress: Complete Attention System ✓")
|
||||
|
||||
# Run the test
|
||||
test_complete_attention_system()
|
||||
test_unit_complete_attention_system()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -759,25 +751,29 @@ simple_seq = np.array([
|
||||
print(f"🎯 Simple test sequence shape: {simple_seq.shape}")
|
||||
|
||||
# Apply attention
|
||||
output, weights = scaled_dot_product_attention(simple_seq, simple_seq, simple_seq)
|
||||
output, weights = scaled_dot_product_attention(Tensor(simple_seq), Tensor(simple_seq), Tensor(simple_seq))
|
||||
|
||||
print(f"🎯 Attention pattern analysis:")
|
||||
print(f"Position 0 attends most to position: {np.argmax(weights[0])}")
|
||||
print(f"Position 3 attends most to position: {np.argmax(weights[3])}")
|
||||
print(f"Position 0 attends most to position: {np.argmax(weights.data[0])}")
|
||||
print(f"Position 3 attends most to position: {np.argmax(weights.data[3])}")
|
||||
print(f"✅ Positions with same content should attend to each other!")
|
||||
|
||||
# Test with causal masking
|
||||
causal_mask = create_causal_mask(4)
|
||||
output_causal, weights_causal = scaled_dot_product_attention(simple_seq, simple_seq, simple_seq, causal_mask)
|
||||
output_causal, weights_causal = scaled_dot_product_attention(Tensor(simple_seq), Tensor(simple_seq), Tensor(simple_seq), Tensor(causal_mask))
|
||||
|
||||
print(f"🎯 With causal masking:")
|
||||
print(f"Position 3 can only attend to positions 0-3: {np.sum(weights_causal[3, :]) > 0.99}")
|
||||
print(f"Position 3 can only attend to positions 0-3: {np.sum(weights_causal.data[3, :]) > 0.99}")
|
||||
|
||||
def plot_attention_patterns(weights, weights_causal):
|
||||
"""Visualize attention patterns."""
|
||||
if not _should_show_plots():
|
||||
return
|
||||
|
||||
if _should_show_plots():
|
||||
plt.figure(figsize=(12, 4))
|
||||
|
||||
plt.subplot(1, 3, 1)
|
||||
plt.imshow(weights, cmap='Blues')
|
||||
plt.imshow(weights.data, cmap='Blues')
|
||||
plt.title('Full Attention Weights\n(Darker = Higher Attention)')
|
||||
plt.xlabel('Key Position')
|
||||
plt.ylabel('Query Position')
|
||||
@@ -786,20 +782,20 @@ if _should_show_plots():
|
||||
# Add text annotations
|
||||
for i in range(4):
|
||||
for j in range(4):
|
||||
plt.text(j, i, f'{weights[i,j]:.2f}',
|
||||
plt.text(j, i, f'{weights.data[i,j]:.2f}',
|
||||
ha='center', va='center',
|
||||
color='white' if weights[i,j] > 0.5 else 'black')
|
||||
color='white' if weights.data[i,j] > 0.5 else 'black')
|
||||
|
||||
plt.subplot(1, 3, 2)
|
||||
plt.imshow(weights_causal, cmap='Blues')
|
||||
plt.imshow(weights_causal.data, cmap='Blues')
|
||||
plt.title('Causal Attention Weights\n(Upper triangle masked)')
|
||||
plt.xlabel('Key Position')
|
||||
plt.ylabel('Query Position')
|
||||
plt.colorbar()
|
||||
|
||||
plt.subplot(1, 3, 3)
|
||||
plt.plot(weights[0], 'o-', label='Position 0 attention')
|
||||
plt.plot(weights[3], 's-', label='Position 3 attention')
|
||||
plt.plot(weights.data[0], 'o-', label='Position 0 attention')
|
||||
plt.plot(weights.data[3], 's-', label='Position 3 attention')
|
||||
plt.xlabel('Attending to Position')
|
||||
plt.ylabel('Attention Weight')
|
||||
plt.title('Attention Distribution')
|
||||
@@ -809,6 +805,8 @@ if _should_show_plots():
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
plot_attention_patterns(weights, weights_causal)
|
||||
|
||||
print("🎯 Attention learns to focus on similar content!")
|
||||
|
||||
print("\n" + "="*50)
|
||||
@@ -824,39 +822,39 @@ print("✅ Complete integration tests")
|
||||
print("\nYou now understand the core mechanism powering modern AI! 🚀")
|
||||
print("Next: Learn how to build complete transformer models using this foundation.")
|
||||
|
||||
def test_attention_mechanism():
|
||||
"""Test attention mechanism implementation."""
|
||||
def test_unit_attention_mechanism():
|
||||
"""Unit test for the attention mechanism implementation."""
|
||||
print("🔬 Unit Test: Attention Mechanism...")
|
||||
|
||||
# Test basic attention
|
||||
Q = np.random.randn(4, 6) * 0.1
|
||||
K = np.random.randn(4, 6) * 0.1
|
||||
V = np.random.randn(4, 6) * 0.1
|
||||
Q = Tensor(np.random.randn(4, 6) * 0.1)
|
||||
K = Tensor(np.random.randn(4, 6) * 0.1)
|
||||
V = Tensor(np.random.randn(4, 6) * 0.1)
|
||||
output, weights = scaled_dot_product_attention(Q, K, V)
|
||||
|
||||
assert output.shape == (4, 6), "Attention should produce correct output shape"
|
||||
assert weights.shape == (4, 4), "Attention weights should be square matrix"
|
||||
assert np.allclose(np.sum(weights, axis=-1), 1.0), "Attention weights should sum to 1"
|
||||
assert np.allclose(np.sum(weights.data, axis=-1), 1.0), "Attention weights should sum to 1"
|
||||
|
||||
print("✅ Attention mechanism works correctly")
|
||||
|
||||
def test_self_attention_wrapper():
|
||||
"""Test self-attention wrapper implementation."""
|
||||
def test_unit_self_attention_wrapper():
|
||||
"""Unit test for the self-attention wrapper implementation."""
|
||||
print("🔬 Unit Test: Self-Attention Wrapper...")
|
||||
|
||||
# Test self-attention
|
||||
self_attn = SelfAttention(d_model=32)
|
||||
x = np.random.randn(8, 32) * 0.1
|
||||
x = Tensor(np.random.randn(8, 32) * 0.1)
|
||||
output, weights = self_attn(x)
|
||||
|
||||
assert output.shape == x.shape, "Self-attention should preserve input shape"
|
||||
assert weights.shape == (8, 8), "Self-attention weights should be square"
|
||||
assert np.allclose(np.sum(weights, axis=-1), 1.0), "Weights should sum to 1"
|
||||
assert np.allclose(np.sum(weights.data, axis=-1), 1.0), "Weights should sum to 1"
|
||||
|
||||
print("✅ Self-attention wrapper works correctly")
|
||||
|
||||
def test_masking_utilities():
|
||||
"""Test attention masking utilities."""
|
||||
def test_unit_masking_utilities():
|
||||
"""Unit test for the attention masking utilities."""
|
||||
print("🔬 Unit Test: Masking Utilities...")
|
||||
|
||||
# Test causal mask
|
||||
@@ -888,7 +886,41 @@ Time to test your implementation! This section uses TinyTorch's standardized tes
|
||||
# This cell is locked to ensure consistent testing across all TinyTorch modules
|
||||
# =============================================================================
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 🔬 Integration Test: Attention with Tensors
|
||||
"""
|
||||
|
||||
# %%
|
||||
def test_module_attention_tensor_compatibility():
    """
    Integration test for the attention mechanism and the Tensor class.

    Passes batched Tensor inputs for Q, K and V to scaled_dot_product_attention
    and checks that both returned values are Tensor instances with the expected
    shapes, and that the attention weights sum to 1 along the last axis.

    Raises:
        AssertionError: If a returned value is not a Tensor, has an unexpected
            shape, or the attention weights do not sum to 1 along the last axis.
    """
    print("🔬 Running Integration Test: Attention with Tensors...")

    # Use a local, seeded generator so the inputs are reproducible from run to
    # run without modifying the global np.random state other cells rely on.
    rng = np.random.default_rng(42)
    batch, seq_len, d_k, d_v = 1, 5, 16, 32

    # 1. Define Q, K, V as Tensors
    q = Tensor(rng.standard_normal((batch, seq_len, d_k)))  # (batch, seq_len, d_k)
    k = Tensor(rng.standard_normal((batch, seq_len, d_k)))
    v = Tensor(rng.standard_normal((batch, seq_len, d_v)))  # (batch, seq_len, d_v)

    # 2. Perform scaled dot-product attention
    output, attn_weights = scaled_dot_product_attention(q, k, v)

    # 3. Assert outputs are Tensors with correct shapes
    assert isinstance(output, Tensor), "Output should be a Tensor"
    assert output.shape == (batch, seq_len, d_v), f"Expected output shape (1, 5, 32), but got {output.shape}"
    assert isinstance(attn_weights, Tensor), "Attention weights should be a Tensor"
    assert attn_weights.shape == (batch, seq_len, seq_len), f"Expected weights shape (1, 5, 5), but got {attn_weights.shape}"

    # 4. Check that attention weights sum to 1
    assert np.allclose(attn_weights.data.sum(axis=-1), 1.0), "Attention weights should sum to 1"

    print("✅ Integration Test Passed: Scaled dot-product attention is compatible with Tensors.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_module_attention_tensor_compatibility()
|
||||
from tito.tools.testing import run_module_tests_auto
|
||||
|
||||
# Automatically discover and run all tests in this module
|
||||
|
||||
Reference in New Issue
Block a user