Mirror of https://github.com/MLSysBook/TinyTorch.git (synced 2026-05-03 12:13:11 -05:00)
This commit implements the pedagogically optimal "inevitable discovery" module progression based on expert validation and educational design principles.

## Module Reordering Summary

**Previous Order (Problems)**:
- 05_losses → 06_autograd → 07_dataloader → 08_optimizers → 09_spatial → 10_training
- Issues: Autograd before optimizers, DataLoader before training, scattered dependencies

**New Order (Beautiful Progression)**:
- 05_losses → 06_optimizers → 07_autograd → 08_training → 09_spatial → 10_dataloader
- Benefits: Each module creates an inevitable need for the next

## Pedagogical Flow Achieved

- **05_losses** → "Need systematic weight updates" → **06_optimizers**
- **06_optimizers** → "Need automatic gradients" → **07_autograd**
- **07_autograd** → "Need systematic training" → **08_training**
- **08_training** → "MLPs hit limits on images" → **09_spatial**
- **09_spatial** → "Training is too slow" → **10_dataloader**

## Technical Changes

### Module Directory Renaming
- `06_autograd` → `07_autograd`
- `07_dataloader` → `10_dataloader`
- `08_optimizers` → `06_optimizers`
- `10_training` → `08_training`
- `09_spatial` → `09_spatial` (no change)

### System Integration Updates
- **MODULE_TO_CHECKPOINT mapping**: Updated in tito/commands/export.py (see the sketch after this message)
- **Test directories**: Renamed module_XX directories to match new numbers
- **Documentation**: Updated all references in MD files and agent configurations
- **CLI integration**: Updated next-steps suggestions for proper flow

### Agent Configuration Updates
- **Quality Assurance**: Updated module audit status with new numbers
- **Module Developer**: Updated work tracking with new sequence
- **Documentation**: Updated MASTER_PLAN_OF_RECORD.md with the new progression

## Educational Benefits

1. **Inevitable Discovery**: Each module naturally leads to the next
2. **Cognitive Load**: Concepts introduced exactly when needed
3. **Motivation**: Students understand WHY each tool is necessary
4. **Synthesis**: Everything flows toward complete ML systems understanding
5. **Professional Alignment**: Matches real ML engineering workflows

## Quality Assurance

- ✅ All CLI commands still function
- ✅ Checkpoint system mappings updated
- ✅ Documentation consistency maintained
- ✅ Test directory structure aligned
- ✅ Agent configurations synchronized

**Impact**: This reordering transforms TinyTorch from a collection of modules into a coherent educational journey where each step naturally motivates the next, creating optimal conditions for deep learning systems understanding.
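For illustration, a minimal sketch of what the reordered `MODULE_TO_CHECKPOINT` mapping in `tito/commands/export.py` might look like after this change. The keys and checkpoint identifiers here are assumptions; only the module ordering comes from the commit message above.

```python
# Hypothetical sketch only -- the real mapping in tito/commands/export.py may use
# different keys and checkpoint names. The ordering mirrors the new progression.
MODULE_TO_CHECKPOINT = {
    "05_losses":     "checkpoint_05",
    "06_optimizers": "checkpoint_06",
    "07_autograd":   "checkpoint_07",
    "08_training":   "checkpoint_08",
    "09_spatial":    "checkpoint_09",
    "10_dataloader": "checkpoint_10",
}
```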
336 lines · 14 KiB · Python
"""
|
|
Module 07: Progressive Integration Tests
|
|
Tests that Module 07 (Attention) works correctly AND that the entire prior stack works.
|
|
|
|
DEPENDENCY CHAIN: 01_setup → 02_tensor → 03_activations → 04_layers → 05_dense → 06_spatial → 07_attention
|
|
This is where attention mechanisms enable sequence understanding.
|
|
"""
|
|
|
|
import numpy as np
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|
|
|
|
|
class TestPriorStackStillWorking:
    """Quick regression checks that prior modules (01→06) still work."""

    def test_foundation_stack_stable(self):
        """Verify foundation stack (01→05) remains stable."""
        # Environment (Module 01)
        assert sys.version_info >= (3, 8), "Foundation broken: Python version"

        # Tensor foundation (Module 02)
        try:
            from tinytorch.core.tensor import Tensor
            t = Tensor([1, 2, 3])
            assert t.shape == (3,), "Foundation broken: Tensor creation"
        except ImportError:
            assert True, "Tensor foundation not implemented yet"

    def test_spatial_operations_stable(self):
        """Verify Module 06 (Spatial) operations still work."""
        try:
            from tinytorch.core.spatial import Conv2D, MaxPool2D

            # Basic spatial operations should work
            conv = Conv2D(in_channels=3, out_channels=16, kernel_size=3)
            pool = MaxPool2D(kernel_size=2)

            assert hasattr(conv, 'forward'), "Spatial broken: Conv2D interface"
            assert hasattr(pool, 'forward'), "Spatial broken: MaxPool2D interface"

        except ImportError:
            assert True, "Spatial operations not implemented yet"

class TestModule07AttentionCore:
    """Test Module 07 (Attention) core functionality."""

    def test_attention_mechanism_creation(self):
        """Test basic attention mechanism works."""
        try:
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.tensor import Tensor

            # Create attention mechanism
            attention = MultiHeadAttention(embed_dim=64, num_heads=8)

            # Should have proper components
            assert hasattr(attention, 'query_proj'), "Attention broken: No query projection"
            assert hasattr(attention, 'key_proj'), "Attention broken: No key projection"
            assert hasattr(attention, 'value_proj'), "Attention broken: No value projection"

            # Test with sequence input
            seq_len, batch_size, embed_dim = 10, 4, 64
            x = Tensor(np.random.randn(seq_len, batch_size, embed_dim))

            output = attention(x)
            assert output.shape == (seq_len, batch_size, embed_dim), "Attention output shape broken"

        except ImportError:
            assert True, "Attention mechanism not implemented yet"

    def test_scaled_dot_product_attention(self):
        """Test core attention computation."""
        try:
            from tinytorch.core.attention import scaled_dot_product_attention
            from tinytorch.core.tensor import Tensor

            # Attention inputs: queries, keys, values
            seq_len, embed_dim = 8, 16
            Q = Tensor(np.random.randn(seq_len, embed_dim))
            K = Tensor(np.random.randn(seq_len, embed_dim))
            V = Tensor(np.random.randn(seq_len, embed_dim))

            # Compute attention
            output, attention_weights = scaled_dot_product_attention(Q, K, V)

            assert output.shape == V.shape, "Attention output shape wrong"
            assert attention_weights.shape == (seq_len, seq_len), "Attention weights shape wrong"

            # Attention weights should sum to 1 across keys
            weight_sums = np.sum(attention_weights.data, axis=1)
            assert np.allclose(weight_sums, 1.0), "Attention weights don't sum to 1"

        except ImportError:
            assert True, "Scaled dot-product attention not implemented yet"

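# For reference, a minimal plain-numpy sketch of what scaled dot-product attention
# is expected to compute: softmax(Q @ K.T / sqrt(d)) @ V, with each row of the
# attention weights summing to 1 across keys. This helper is illustrative only --
# it is not the tinytorch.core.attention implementation, just the standard formula
# the tests above assume. It operates on 2D numpy arrays of shape (seq_len, embed_dim).
def _reference_scaled_dot_product_attention(Q, K, V):
    d = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d)                             # (seq_len, seq_len) similarities
    scores = scores - scores.max(axis=-1, keepdims=True)      # numerical stability for softmax
    weights = np.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)   # rows sum to 1 across keys
    return weights @ V, weights
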
class TestProgressiveStackIntegration:
    """Test that the complete stack (01→07) works together."""

    def test_neural_network_with_attention(self):
        """Test neural network enhanced with attention."""
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense
            from tinytorch.core.activations import ReLU
            from tinytorch.core.attention import MultiHeadAttention

            # Build network: dense → attention → dense
            encoder = Dense(64, 64)
            attention = MultiHeadAttention(embed_dim=64, num_heads=8)
            decoder = Dense(64, 10)
            relu = ReLU()

            # Sequence input
            seq_len, batch_size, input_dim = 12, 4, 64
            x = Tensor(np.random.randn(seq_len, batch_size, input_dim))

            # Forward pass through network with attention
            h = relu(encoder(x))        # Dense processing
            attn_out = attention(h)     # Attention mechanism
            output = decoder(attn_out)  # Final projection

            assert output.shape == (seq_len, batch_size, 10), "Network with attention broken"

        except ImportError:
            assert True, "Neural network with attention not ready yet"

    def test_transformer_block_capability(self):
        """Test building transformer-style blocks."""
        try:
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.layers import Dense
            from tinytorch.core.activations import ReLU
            from tinytorch.core.tensor import Tensor

            # Transformer block components
            attention = MultiHeadAttention(embed_dim=128, num_heads=8)
            ff1 = Dense(128, 512)
            ff2 = Dense(512, 128)
            relu = ReLU()

            # Input sequence
            seq_len, batch_size, embed_dim = 16, 2, 128
            x = Tensor(np.random.randn(seq_len, batch_size, embed_dim))

            # Transformer block: attention + feedforward
            attn_out = attention(x)
            ff_out = ff2(relu(ff1(attn_out)))

            # Residual connection (if implemented)
            if hasattr(x, '__add__'):
                output = x + ff_out  # Residual connection
            else:
                output = ff_out

            assert output.shape == x.shape, "Transformer block broken"

        except ImportError:
            assert True, "Transformer block capability not ready yet"

class TestSequenceUnderstandingCapability:
    """Test that attention enables sequence understanding."""

    def test_sequence_to_sequence_capability(self):
        """Test sequence-to-sequence processing."""
        try:
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.tensor import Tensor

            # Encoder-decoder style processing
            encoder_attention = MultiHeadAttention(embed_dim=64, num_heads=4)
            decoder_attention = MultiHeadAttention(embed_dim=64, num_heads=4)

            # Source and target sequences
            src_len, tgt_len, batch_size, embed_dim = 10, 8, 2, 64
            src = Tensor(np.random.randn(src_len, batch_size, embed_dim))
            tgt = Tensor(np.random.randn(tgt_len, batch_size, embed_dim))

            # Encode source sequence
            encoded = encoder_attention(src)

            # Decode target sequence (with potential cross-attention)
            if hasattr(decoder_attention, 'cross_attention'):
                decoded = decoder_attention(tgt, encoded)
            else:
                decoded = decoder_attention(tgt)

            assert encoded.shape == src.shape, "Sequence encoding broken"
            assert decoded.shape == tgt.shape, "Sequence decoding broken"

        except ImportError:
            assert True, "Sequence-to-sequence not ready yet"

    def test_attention_pattern_analysis(self):
        """Test that attention creates meaningful patterns."""
        try:
            from tinytorch.core.attention import scaled_dot_product_attention
            from tinytorch.core.tensor import Tensor

            # Create sequence with clear patterns
            seq_len, embed_dim = 6, 8

            # Pattern: first and last tokens should attend to each other
            pattern_input = np.zeros((seq_len, embed_dim))
            pattern_input[0, :] = 1.0   # First token
            pattern_input[-1, :] = 1.0  # Last token

            Q = Tensor(pattern_input)
            K = Tensor(pattern_input)
            V = Tensor(pattern_input)

            output, attention_weights = scaled_dot_product_attention(Q, K, V)

            # Check attention patterns make sense:
            # first token should attend strongly to last token, and vice versa
            first_to_last = attention_weights.data[0, -1]
            last_to_first = attention_weights.data[-1, 0]

            # These should be among the highest attention weights
            assert first_to_last > 0.1, "Attention pattern not detected"
            assert last_to_first > 0.1, "Attention pattern not detected"

        except ImportError:
            assert True, "Attention pattern analysis not ready yet"

class TestNLPReadiness:
    """Test readiness for NLP applications."""

    def test_language_modeling_architecture(self):
        """Test architecture suitable for language modeling."""
        try:
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.layers import Dense
            from tinytorch.core.tensor import Tensor

            # Language model components
            vocab_size, embed_dim, seq_len = 1000, 256, 32

            # Embedding layer (simplified)
            embedding = Dense(vocab_size, embed_dim)

            # Attention layers
            attention1 = MultiHeadAttention(embed_dim=embed_dim, num_heads=8)
            attention2 = MultiHeadAttention(embed_dim=embed_dim, num_heads=8)

            # Output projection
            output_proj = Dense(embed_dim, vocab_size)

            # Token sequence (integer token ids)
            batch_size = 4
            tokens = Tensor(np.random.randint(0, vocab_size, (seq_len, batch_size)))

            # Simple embedding lookup (simplified)
            if hasattr(embedding, 'embedding_lookup'):
                x = embedding.embedding_lookup(tokens)
            else:
                # Simplified: random embeddings
                x = Tensor(np.random.randn(seq_len, batch_size, embed_dim))

            # Transformer layers
            h1 = attention1(x)
            h2 = attention2(h1)

            # Output logits
            logits = output_proj(h2)

            assert logits.shape == (seq_len, batch_size, vocab_size), "Language model architecture broken"

        except ImportError:
            assert True, "Language modeling architecture not ready yet"

class TestRegressionPrevention:
    """Ensure previous modules still work after Module 07 development."""

    def test_no_foundation_regression(self):
        """Verify foundation stack (01→05) unchanged."""
        # Environment should remain stable
        assert sys.version_info.major >= 3, "Foundation: Python detection broken"

        # Project structure should remain intact
        project_root = Path(__file__).parent.parent.parent
        assert project_root.exists(), "Foundation: Project structure broken"

    def test_no_spatial_regression(self):
        """Verify spatial operations (Module 06) unchanged."""
        try:
            from tinytorch.core.spatial import Conv2D

            # Spatial operations should still work
            conv = Conv2D(in_channels=1, out_channels=8, kernel_size=3)
            assert hasattr(conv, 'forward'), "Spatial regression: Conv2D broken"

        except ImportError:
            # If not implemented, that's fine,
            # but numpy should still work (from foundation)
            arr = np.array([1, 2, 3])
            assert arr.shape == (3,), "Spatial regression: Numpy foundation broken"

    def test_progressive_stability(self):
        """Test the progressive stack is stable through attention."""
        # Stack should be stable through:
        # Setup → Tensor → Activations → Layers → Dense → Spatial → Attention

        # Setup level
        assert np is not None, "Setup level broken"

        # Foundation level (if available)
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense

            # Should still be able to build neural networks
            layer = Dense(10, 5)
            x = Tensor(np.random.randn(4, 10))
            output = layer(x)
            assert output.shape == (4, 5), "Foundation level broken"

        except ImportError:
            pass  # Not implemented yet

        # Attention level (if available)
        try:
            from tinytorch.core.attention import MultiHeadAttention
            attention = MultiHeadAttention(embed_dim=32, num_heads=4)
            assert callable(attention), "Attention level broken"
        except ImportError:
            pass  # Not implemented yet