Mirror of https://github.com/MLSysBook/TinyTorch.git (synced 2026-04-27 04:27:32 -05:00)
This commit implements the pedagogically optimal "inevitable discovery" module progression based on expert validation and educational design principles.

## Module Reordering Summary

**Previous Order (Problems)**:
- 05_losses → 06_autograd → 07_dataloader → 08_optimizers → 09_spatial → 10_training
- Issues: Autograd before optimizers, DataLoader before training, scattered dependencies

**New Order (Beautiful Progression)**:
- 05_losses → 06_optimizers → 07_autograd → 08_training → 09_spatial → 10_dataloader
- Benefits: Each module creates an inevitable need for the next

## Pedagogical Flow Achieved

- **05_losses** → "Need systematic weight updates" → **06_optimizers**
- **06_optimizers** → "Need automatic gradients" → **07_autograd**
- **07_autograd** → "Need systematic training" → **08_training**
- **08_training** → "MLPs hit limits on images" → **09_spatial**
- **09_spatial** → "Training is too slow" → **10_dataloader**

## Technical Changes

### Module Directory Renaming
- `06_autograd` → `07_autograd`
- `07_dataloader` → `10_dataloader`
- `08_optimizers` → `06_optimizers`
- `10_training` → `08_training`
- `09_spatial` → `09_spatial` (no change)

### System Integration Updates
- **MODULE_TO_CHECKPOINT mapping**: Updated in tito/commands/export.py (see the sketch below)
- **Test directories**: Renamed module_XX directories to match the new numbers
- **Documentation**: Updated all references in MD files and agent configurations
- **CLI integration**: Updated next-steps suggestions for the proper flow

### Agent Configuration Updates
- **Quality Assurance**: Updated module audit status with the new numbers
- **Module Developer**: Updated work tracking with the new sequence
- **Documentation**: Updated MASTER_PLAN_OF_RECORD.md with the new progression

## Educational Benefits

1. **Inevitable Discovery**: Each module naturally leads to the next
2. **Cognitive Load**: Concepts are introduced exactly when needed
3. **Motivation**: Students understand WHY each tool is necessary
4. **Synthesis**: Everything flows toward complete ML systems understanding
5. **Professional Alignment**: Matches real ML engineering workflows

## Quality Assurance

- ✅ All CLI commands still function
- ✅ Checkpoint system mappings updated
- ✅ Documentation consistency maintained
- ✅ Test directory structure aligned
- ✅ Agent configurations synchronized

**Impact**: This reordering transforms TinyTorch from a collection of modules into a coherent educational journey where each step naturally motivates the next, creating optimal conditions for a deep understanding of ML systems.
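For illustration only, the directory renaming and the reordered checkpoint mapping can be sketched as plain Python dictionaries. The module directory names come from the lists above; the `MODULE_RENAMES` helper name, the checkpoint identifier strings, and the exact structure of the real `MODULE_TO_CHECKPOINT` dictionary in tito/commands/export.py are assumptions for this sketch, not the actual implementation.

```python
# Hypothetical sketch; not the actual contents of tito/commands/export.py.
# Directory names are taken from the commit message; the helper name and
# checkpoint identifiers are assumed for illustration.

# Old -> new module directory names.
MODULE_RENAMES = {
    "06_autograd":   "07_autograd",
    "07_dataloader": "10_dataloader",
    "08_optimizers": "06_optimizers",
    "10_training":   "08_training",
    "09_spatial":    "09_spatial",  # no change
}

# Possible shape of the reordered MODULE_TO_CHECKPOINT mapping.
MODULE_TO_CHECKPOINT = {
    "05_losses":     "checkpoint_05",
    "06_optimizers": "checkpoint_06",
    "07_autograd":   "checkpoint_07",
    "08_training":   "checkpoint_08",
    "09_spatial":    "checkpoint_09",
    "10_dataloader": "checkpoint_10",
}
```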
369 lines · 15 KiB · Python
"""
|
|
Integration Tests - Attention Pipeline
|
|
|
|
Tests cross-module pipeline interfaces and compatibility.
|
|
Focuses on how attention integrates with other TinyTorch modules to build complete workflows.
|
|
"""
|
|
|
|
import pytest
|
|
import numpy as np
|
|
from test_utils import setup_integration_test
|
|
|
|
# Ensure proper setup before importing
|
|
setup_integration_test()
|
|
|
|
# Import ONLY from TinyTorch package
|
|
from tinytorch.core.tensor import Tensor
|
|
from tinytorch.core.attention import scaled_dot_product_attention, SelfAttention, create_causal_mask
|
|
from tinytorch.core.layers import Dense
|
|
from tinytorch.core.activations import ReLU, Softmax
|
|
from tinytorch.core.dense import Sequential
|
|
|
|
|
|


class TestAttentionDensePipelineInterface:
    """Test interface compatibility between Attention and Dense modules."""

    def test_attention_output_to_dense_input(self):
        """Test that attention output can be used as Dense layer input."""
        seq_len, d_model = 6, 16

        # Create attention and dense components
        self_attn = SelfAttention(d_model)
        dense = Dense(input_size=d_model, output_size=10)

        # Create input
        x = Tensor(np.random.randn(seq_len, d_model))

        # Test pipeline interface: Attention → Dense
        attn_output, _ = self_attn(x.data)

        # Test that attention output can feed into dense layer
        for i in range(seq_len):
            pos_input = Tensor(attn_output[i:i+1])  # Single position
            dense_output = dense(pos_input)

            # Verify interface compatibility
            assert isinstance(dense_output, Tensor), "Dense should accept attention output as Tensor"
            assert dense_output.shape == (1, 10), "Dense should process attention output correctly"

    def test_attention_sequential_compatibility(self):
        """Test that attention can be integrated into Sequential pipelines."""
        d_model = 8

        # Test if we can build: Tensor → Dense → Attention-style processing
        input_tensor = Tensor(np.random.randn(4, 6))

        # Step 1: Dense layer to project to d_model
        projection = Dense(input_size=6, output_size=d_model)
        projected = projection(input_tensor)

        # Step 2: Attention processing (simulating attention in pipeline)
        self_attn = SelfAttention(d_model)
        attn_output, _ = self_attn(projected.data)

        # Step 3: Back to Dense layer
        output_projection = Dense(input_size=d_model, output_size=3)
        final_outputs = []
        for i in range(4):
            pos_input = Tensor(attn_output[i:i+1])
            pos_output = output_projection(pos_input)
            final_outputs.append(pos_output.data)

        final_result = np.concatenate(final_outputs, axis=0)

        # Verify pipeline interface works
        assert final_result.shape == (4, 3), "Complete pipeline should work"
        assert not np.any(np.isnan(final_result)), "Pipeline should produce valid outputs"

    def test_attention_with_activation_integration(self):
        """Test attention integration with activation functions."""
        seq_len, d_model = 5, 12

        # Create components
        self_attn = SelfAttention(d_model)
        relu = ReLU()
        dense = Dense(input_size=d_model, output_size=d_model)

        # Test pipeline: Input → Attention → Activation → Dense
        x = Tensor(np.random.randn(seq_len, d_model))

        # Attention step
        attn_output, _ = self_attn(x.data)

        # Process each position through activation and dense
        for i in range(seq_len):
            # Attention → Tensor → Activation → Dense pipeline
            pos_tensor = Tensor(attn_output[i:i+1])
            activated = relu(pos_tensor)
            dense_output = dense(activated)

            # Verify cross-module interface
            assert isinstance(activated, Tensor), "Activation should work with attention output"
            assert isinstance(dense_output, Tensor), "Dense should work after activation"
            assert dense_output.shape == (1, d_model), "Pipeline should preserve expected shapes"


class TestAttentionMultiModuleWorkflows:
    """Test attention in multi-module workflows and architectures."""

    def test_encoder_decoder_interface_pattern(self):
        """Test encoder-decoder pattern using multiple TinyTorch modules."""
        src_len, tgt_len, d_model = 6, 4, 16

        # Source processing (encoder-style)
        src = Tensor(np.random.randn(src_len, d_model))
        src_projection = Dense(input_size=d_model, output_size=d_model)
        src_projected = src_projection(src)

        encoder_attn = SelfAttention(d_model)
        encoded, _ = encoder_attn(src_projected.data)

        # Target processing (decoder-style)
        tgt = Tensor(np.random.randn(tgt_len, d_model))
        tgt_projection = Dense(input_size=d_model, output_size=d_model)
        tgt_projected = tgt_projection(tgt)

        # Cross-attention interface test
        cross_output, _ = scaled_dot_product_attention(
            tgt_projected.data,  # Queries from target
            encoded,             # Keys from encoder
            encoded              # Values from encoder
        )

        # Final processing
        output_projection = Dense(input_size=d_model, output_size=10)
        final_outputs = []
        for i in range(tgt_len):
            pos_input = Tensor(cross_output[i:i+1])
            pos_output = output_projection(pos_input)
            final_outputs.append(pos_output.data)

        final_result = np.concatenate(final_outputs, axis=0)

        # Verify multi-module workflow
        assert final_result.shape == (tgt_len, 10), "Encoder-decoder workflow should work"
        assert not np.any(np.isnan(final_result)), "Multi-module workflow should be stable"

    def test_multi_layer_attention_with_residuals(self):
        """Test multi-layer attention with residual connections using multiple modules."""
        seq_len, d_model = 8, 20
        num_layers = 3

        # Initial processing
        x = Tensor(np.random.randn(seq_len, d_model))
        embedding_projection = Dense(input_size=d_model, output_size=d_model)
        current_repr = embedding_projection(x).data

        # Multi-layer processing with residuals
        for layer in range(num_layers):
            # Self-attention
            attn = SelfAttention(d_model)
            attn_output, _ = attn(current_repr)

            # Feedforward network (using Dense layers)
            ff_network = Sequential([
                Dense(input_size=d_model, output_size=d_model * 2),
                ReLU(),
                Dense(input_size=d_model * 2, output_size=d_model)
            ])

            # Process each position through feedforward
            ff_outputs = []
            for i in range(seq_len):
                pos_input = Tensor(attn_output[i:i+1])
                pos_output = ff_network(pos_input)
                ff_outputs.append(pos_output.data)

            ff_result = np.concatenate(ff_outputs, axis=0)

            # Residual connection (attention + feedforward)
            current_repr = attn_output + ff_result

        # Verify multi-layer integration
        assert current_repr.shape == (seq_len, d_model), "Multi-layer should preserve shape"
        assert not np.any(np.isnan(current_repr)), "Multi-layer integration should be stable"

    def test_attention_classification_pipeline(self):
        """Test attention in classification pipeline with multiple modules."""
        seq_len, d_model, num_classes = 10, 24, 5

        # Input processing
        sentence = Tensor(np.random.randn(seq_len, d_model))
        input_projection = Dense(input_size=d_model, output_size=d_model)
        projected_input = input_projection(sentence)

        # Attention processing
        self_attn = SelfAttention(d_model)
        attended_seq, _ = self_attn(projected_input.data)

        # Global pooling (sequence → single representation)
        pooled_repr = np.mean(attended_seq, axis=0, keepdims=True)

        # Classification head (using Sequential)
        classifier = Sequential([
            Dense(input_size=d_model, output_size=d_model // 2),
            ReLU(),
            Dense(input_size=d_model // 2, output_size=num_classes)
        ])

        # Final classification
        pooled_tensor = Tensor(pooled_repr)
        class_scores = classifier(pooled_tensor)

        # Verify classification pipeline
        assert class_scores.shape == (1, num_classes), "Classification pipeline should work"
        assert isinstance(class_scores, Tensor), "Pipeline should produce Tensor output"


class TestAttentionDataFlowCompatibility:
    """Test data flow compatibility between attention and other modules."""

    def test_shape_preservation_across_modules(self):
        """Test that shapes flow correctly between attention and other modules."""
        batch_configs = [
            (4, 8),    # Small sequence
            (16, 32),  # Medium sequence
            (8, 64),   # Large model dimension
        ]

        for seq_len, d_model in batch_configs:
            # Input
            x = Tensor(np.random.randn(seq_len, d_model))

            # Processing pipeline
            input_proj = Dense(input_size=d_model, output_size=d_model)
            projected = input_proj(x)

            attn = SelfAttention(d_model)
            attn_out, _ = attn(projected.data)

            output_proj = Dense(input_size=d_model, output_size=d_model // 2)

            # Test shape flow
            for i in range(seq_len):
                pos_tensor = Tensor(attn_out[i:i+1])
                final_out = output_proj(pos_tensor)

                # Verify shape compatibility
                assert final_out.shape == (1, d_model // 2), f"Shape flow failed for config {(seq_len, d_model)}"

    def test_dtype_preservation_across_modules(self):
        """Test that data types are preserved across attention and other modules."""
        seq_len, d_model = 6, 16

        # Test float32 flow
        x_f32 = Tensor(np.random.randn(seq_len, d_model).astype(np.float32))

        dense_f32 = Dense(input_size=d_model, output_size=d_model)
        projected_f32 = dense_f32(x_f32)

        attn_f32 = SelfAttention(d_model)
        attn_out_f32, _ = attn_f32(projected_f32.data)

        # Verify dtype flow
        assert projected_f32.dtype == np.float32, "Dense should preserve float32"
        assert attn_out_f32.dtype == np.float32, "Attention should preserve float32"

        # Test conversion back to Tensor
        result_tensor_f32 = Tensor(attn_out_f32)
        assert result_tensor_f32.dtype == np.float32, "Tensor creation should preserve float32"

    def test_error_handling_across_modules(self):
        """Test error handling when modules are incompatibly connected."""
        # Test dimension mismatch between attention and dense
        seq_len = 4
        attn_dim = 8
        dense_dim = 16  # Intentional mismatch

        x = Tensor(np.random.randn(seq_len, attn_dim))
        attn = SelfAttention(attn_dim)
        attn_out, _ = attn(x.data)

        # This should fail gracefully with a clear error
        incompatible_dense = Dense(input_size=dense_dim, output_size=10)

        pos_tensor = Tensor(attn_out[0:1])  # Shape (1, 8)
        with pytest.raises((ValueError, AssertionError, TypeError)):
            incompatible_dense(pos_tensor)  # Expects (1, 16), so this must raise


class TestAttentionSystemLevelIntegration:
    """Test system-level integration scenarios."""

    def test_complete_transformer_block_simulation(self):
        """Test simulation of complete transformer block using TinyTorch modules."""
        seq_len, d_model = 8, 32

        # Input
        x = Tensor(np.random.randn(seq_len, d_model))

        # Transformer block simulation
        # 1. Self-attention
        self_attn = SelfAttention(d_model)
        attn_out, _ = self_attn(x.data)

        # 2. Residual connection (attention + input)
        attn_residual = attn_out + x.data

        # 3. Feedforward network
        ff_net = Sequential([
            Dense(input_size=d_model, output_size=d_model * 4),
            ReLU(),
            Dense(input_size=d_model * 4, output_size=d_model)
        ])

        # Process each position through feedforward
        ff_outputs = []
        for i in range(seq_len):
            pos_input = Tensor(attn_residual[i:i+1])
            pos_output = ff_net(pos_input)
            ff_outputs.append(pos_output.data)

        ff_result = np.concatenate(ff_outputs, axis=0)

        # 4. Second residual connection
        final_output = attn_residual + ff_result

        # Verify complete transformer block simulation
        assert final_output.shape == (seq_len, d_model), "Transformer block should preserve shape"
        assert not np.any(np.isnan(final_output)), "Transformer block should be stable"

        # Test that output can be used for next layer
        next_attn = SelfAttention(d_model)
        next_out, _ = next_attn(final_output)
        assert next_out.shape == (seq_len, d_model), "Should be stackable"

    def test_modular_component_replacement(self):
        """Test that attention components can be replaced modularly."""
        seq_len, d_model = 6, 16

        x = Tensor(np.random.randn(seq_len, d_model))

        # Pipeline with different attention configurations
        attention_variants = [
            SelfAttention(d_model),
            SelfAttention(d_model),  # Different instance
            SelfAttention(d_model),  # Another instance
        ]

        dense_postprocess = Dense(input_size=d_model, output_size=8)

        # Test that all variants work in same pipeline
        for i, attn_variant in enumerate(attention_variants):
            attn_out, _ = attn_variant(x.data)

            # Process first position
            pos_tensor = Tensor(attn_out[0:1])
            result = dense_postprocess(pos_tensor)

            # Verify modular replacement works
            assert result.shape == (1, 8), f"Attention variant {i} should work in pipeline"
            assert isinstance(result, Tensor), f"Attention variant {i} should produce Tensor output"


if __name__ == "__main__":
    pytest.main([__file__])