"""
|
|
NLP Pipeline Flow Integration Tests
|
|
====================================
|
|
|
|
Tests that the NLP pipeline works end-to-end:
|
|
1. Tokenization produces valid token IDs
|
|
2. Embeddings convert tokens to vectors
|
|
3. Attention mechanisms process sequences
|
|
4. Transformers combine everything correctly
|
|
5. Gradients flow back through the entire pipeline
|
|
|
|
These tests catch issues at module boundaries in the NLP stack.
|
|
|
|
Modules tested: 10-13 (Tokenization → Embeddings → Attention → Transformers)
|
|
"""
|
|
|
|
import pytest
|
|
import numpy as np
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add project root
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|
|
|
from tinytorch.core.tensor import Tensor
|
|
from tinytorch.core.autograd import enable_autograd
|
|
from tinytorch.core.embeddings import Embedding
|
|
from tinytorch.core.attention import MultiHeadAttention
|
|
from tinytorch.core.transformers import TransformerBlock
|
|
from tinytorch.core.layers import Linear
|
|
from tinytorch.core.losses import CrossEntropyLoss
|
|
|
|
# Enable autograd
|
|
enable_autograd()
|
|
|
|
|
|


class TestEmbeddingGradientFlow:
    """
    Critical Test: Verify gradients flow through embeddings.

    Common bugs caught:
    - Embedding lookup not differentiable
    - Wrong gradient accumulation for repeated tokens
    - Shape mismatches between embedding and attention
    """

    def test_embedding_receives_gradients(self):
        """Embedding weights must receive gradients during training"""
        vocab_size = 100
        embed_dim = 32
        embedding = Embedding(vocab_size, embed_dim)

        # Enable gradient tracking on embedding weights
        embedding.weight.requires_grad = True

        # Token IDs (as Tensor)
        token_ids = Tensor(np.array([1, 5, 3, 7, 2]))

        # Forward pass
        embedded = embedding.forward(token_ids)

        # Simple loss: sum of embeddings using a Tensor operation to preserve the graph
        loss = embedded.sum()
        loss.backward()

        # Embedding weights should have gradients
        assert embedding.weight.grad is not None, (
            "Embedding weights did not receive gradients!"
        )

        # Only used token embeddings should have non-zero gradients
        for token_id in [1, 5, 3, 7, 2]:  # Use raw values instead of iterating the tensor
            grad_row = embedding.weight.grad[token_id]
            assert np.any(grad_row != 0), (
                f"Token {token_id} embedding has zero gradient!"
            )

    def test_repeated_tokens_accumulate_gradients(self):
        """Same token appearing twice should have accumulated gradient"""
        vocab_size = 10
        embed_dim = 4
        embedding = Embedding(vocab_size, embed_dim)

        # Enable gradient tracking on embedding weights
        embedding.weight.requires_grad = True

        # Token 5 appears twice (as Tensor)
        token_ids = Tensor(np.array([5, 2, 5, 3]))

        embedded = embedding.forward(token_ids)

        # Loss that weights all positions equally, using a Tensor operation
        loss = embedded.sum()
        loss.backward()

        # Token 5 should have ~2x the gradient of token 2 or 3
        grad_5 = np.linalg.norm(embedding.weight.grad[5])
        grad_2 = np.linalg.norm(embedding.weight.grad[2])

        # Allow some tolerance
        assert grad_5 > grad_2 * 1.5, (
            f"Repeated token gradient not accumulated!\n"
            f"  Token 5 (appears 2x) grad: {grad_5}\n"
            f"  Token 2 (appears 1x) grad: {grad_2}\n"
            f"  Expected ratio ~2, got {grad_5/grad_2:.2f}"
        )
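

# Reference sketch (illustrative, not part of the graded assertions above):
# for a loss of embedded.sum(), every occurrence of a token contributes a row
# of ones to that token's gradient, so the expected dL/dW can be reproduced
# directly with np.add.at. This assumes embedding.weight.grad is a NumPy
# array indexed by token ID, exactly as the assertions above already rely on.
def _reference_embedding_grad(vocab_size, embed_dim, token_ids):
    """Expected embedding-weight gradient for loss = embedding(token_ids).sum()."""
    grad = np.zeros((vocab_size, embed_dim))
    # np.add.at accumulates for repeated indices, which is the behavior
    # test_repeated_tokens_accumulate_gradients checks for.
    np.add.at(grad, np.asarray(token_ids), np.ones((len(token_ids), embed_dim)))
    return grad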


class TestAttentionGradientFlow:
    """
    Critical Test: Verify gradients flow through the attention mechanism.

    Common bugs caught:
    - Softmax gradient issues
    - Attention weights not differentiable
    - Query/Key/Value projection gradients
    """

    def test_attention_all_projections_receive_gradients(self):
        """Q, K, V projections must all receive gradients"""
        embed_dim = 32
        num_heads = 4
        seq_len = 8
        batch_size = 2

        attention = MultiHeadAttention(embed_dim, num_heads)

        # Random input sequence
        x = Tensor(
            np.random.randn(batch_size, seq_len, embed_dim),
            requires_grad=True
        )

        # Forward pass (self-attention: a single input serves as Q, K, and V)
        output = attention.forward(x)

        # Simple loss - use a Tensor operation to maintain the computation graph
        loss = output.sum()
        loss.backward()

        # All projection matrices should have gradients
        projections = ['W_q', 'W_k', 'W_v', 'W_o']
        for proj_name in projections:
            if hasattr(attention, proj_name):
                proj = getattr(attention, proj_name)
                if hasattr(proj, 'weight'):
                    assert proj.weight.grad is not None, (
                        f"{proj_name} did not receive gradients!"
                    )

    def test_attention_input_receives_gradients(self):
        """Input to attention must receive gradients for residual connections"""
        embed_dim = 16
        num_heads = 2

        attention = MultiHeadAttention(embed_dim, num_heads)

        x = Tensor(
            np.random.randn(1, 4, embed_dim),
            requires_grad=True
        )

        output = attention.forward(x)
        # Use a Tensor operation to maintain the computation graph
        loss = output.sum()
        loss.backward()

        assert x.grad is not None, (
            "Input to attention did not receive gradients!\n"
            "This breaks residual connections in Transformers."
        )

        assert x.grad.shape == x.shape, (
            f"Input gradient shape mismatch: {x.grad.shape} vs {x.shape}"
        )
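

# Numerical cross-check sketch (an illustration, not one of the required
# tests): approximate d(sum(output))/dx at a single input entry with a
# central finite difference and compare it against the autograd value stored
# in x.grad. Assumes attention.forward is deterministic and that Tensor
# exposes its values as a NumPy array via .data, as the tests in this file
# already do.
def _finite_difference_attention_grad(attention, x_values, index, eps=1e-5):
    """Central-difference estimate of the gradient of sum(attention(x)) at one entry."""
    plus, minus = x_values.copy(), x_values.copy()
    plus[index] += eps
    minus[index] -= eps
    f_plus = attention.forward(Tensor(plus)).data.sum()
    f_minus = attention.forward(Tensor(minus)).data.sum()
    return (f_plus - f_minus) / (2 * eps)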


class TestTransformerGradientFlow:
    """
    Critical Test: Verify gradients flow through a complete Transformer.

    Common bugs caught:
    - Residual connection gradients
    - Layer norm gradient issues
    - Deep network vanishing gradients
    """

    def test_transformer_block_gradient_flow(self):
        """Gradients must flow through a complete transformer block"""
        embed_dim = 32
        num_heads = 4
        ff_dim = 64

        block = TransformerBlock(embed_dim, num_heads, ff_dim)

        x = Tensor(
            np.random.randn(1, 8, embed_dim),
            requires_grad=True
        )

        output = block.forward(x)
        # Use a Tensor operation to preserve the computation graph
        loss = output.sum()
        loss.backward()

        # Input must receive gradients (for stacking blocks)
        assert x.grad is not None, (
            "Transformer block input did not receive gradients!"
        )

        # Gradient should not be too small (vanishing)
        grad_norm = np.linalg.norm(x.grad)
        assert grad_norm > 1e-6, (
            f"Vanishing gradients in transformer block: {grad_norm}"
        )

    def test_stacked_transformer_blocks(self):
        """Gradients must flow through multiple stacked blocks"""
        embed_dim = 32
        num_heads = 4
        ff_dim = 64
        num_layers = 4

        blocks = [TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(num_layers)]

        x = Tensor(
            np.random.randn(1, 8, embed_dim),
            requires_grad=True
        )

        # Forward through all blocks
        h = x
        for block in blocks:
            h = block.forward(h)

        # Use a Tensor operation to preserve the computation graph
        loss = h.sum()
        loss.backward()

        # Input must receive gradients through all layers
        assert x.grad is not None, (
            f"Gradients did not flow through {num_layers} transformer blocks!"
        )

        # Check gradient magnitude is reasonable
        grad_norm = np.linalg.norm(x.grad)
        assert grad_norm > 1e-8, (
            f"Severe vanishing gradients through {num_layers} blocks: {grad_norm}"
        )
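

# Diagnostic sketch (not asserted anywhere): record how the input-gradient
# norm behaves as the stack gets deeper. A steady collapse toward zero with
# depth is the classic vanishing-gradient signature the thresholds above are
# guarding against. Assumes fresh blocks and inputs can be built per depth,
# exactly as the tests above do.
def _input_grad_norm_by_depth(embed_dim=32, num_heads=4, ff_dim=64, max_depth=4):
    norms = []
    for depth in range(1, max_depth + 1):
        blocks = [TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(depth)]
        x = Tensor(np.random.randn(1, 8, embed_dim), requires_grad=True)
        h = x
        for block in blocks:
            h = block.forward(h)
        h.sum().backward()
        norms.append(np.linalg.norm(x.grad))
    return norms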


class TestNLPPipelineEndToEnd:
    """
    Integration Test: Full NLP pipeline from tokens to loss.

    This tests the complete flow:
    tokens → embedding → attention → linear → loss
    """

    def test_complete_nlp_forward_backward(self):
        """Complete NLP pipeline must work end-to-end"""
        vocab_size = 100
        embed_dim = 32
        num_heads = 4
        num_classes = 10
        seq_len = 8
        batch_size = 1

        # Build pipeline
        embedding = Embedding(vocab_size, embed_dim)
        embedding.weight.requires_grad = True
        attention = MultiHeadAttention(embed_dim, num_heads)
        classifier = Linear(embed_dim, num_classes)
        classifier.weight.requires_grad = True
        loss_fn = CrossEntropyLoss()

        # Input: token IDs (as Tensor) - shape (batch_size, seq_len)
        token_ids = Tensor(np.random.randint(0, vocab_size, (batch_size, seq_len)))
        target = Tensor(np.array([3]))  # Class 3

        # Forward pass
        embedded = embedding.forward(token_ids)  # [batch_size, seq_len, embed_dim]
        attended = attention.forward(embedded)   # [batch_size, seq_len, embed_dim]

        # Mean pooling over the sequence dimension (axis 1).
        # Note: pooling through NumPy on .data detaches the result from the
        # computation graph, so this test only verifies gradients downstream
        # of the pooled tensor (i.e., the classifier).
        pooled_data = attended.data.mean(axis=1)  # [batch_size, embed_dim]
        pooled = Tensor(pooled_data, requires_grad=True)

        logits = classifier.forward(pooled)  # [batch_size, num_classes]
        loss = loss_fn.forward(logits, target)

        # Backward pass
        loss.backward()

        # Verify classifier received gradients
        assert classifier.weight.grad is not None, (
            "Classifier did not receive gradients!"
        )
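

# Companion sketch to the end-to-end test (illustrative only; it reuses the
# APIs exercised above and assumes .grad stays None for parameters that
# backward() never reaches): run a miniature pipeline and report which
# parameters ended up with gradients. Because pooling goes through NumPy,
# the graph is cut at `pooled`, so the embedding is expected to report False
# while the classifier reports True.
def _report_gradient_coverage(vocab_size=50, embed_dim=16, num_heads=2, num_classes=5):
    embedding = Embedding(vocab_size, embed_dim)
    embedding.weight.requires_grad = True
    attention = MultiHeadAttention(embed_dim, num_heads)
    classifier = Linear(embed_dim, num_classes)
    classifier.weight.requires_grad = True

    token_ids = Tensor(np.random.randint(0, vocab_size, (1, 4)))
    embedded = embedding.forward(token_ids)
    attended = attention.forward(embedded)
    pooled = Tensor(attended.data.mean(axis=1), requires_grad=True)  # cuts the graph
    loss = CrossEntropyLoss().forward(classifier.forward(pooled), Tensor(np.array([0])))
    loss.backward()

    return {
        "embedding (upstream of the cut)": embedding.weight.grad is not None,
        "classifier (downstream of the cut)": classifier.weight.grad is not None,
    }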


# Quick smoke tests for CI
@pytest.mark.quick
class TestQuickNLPSmoke:
    """Fast tests for CI"""

    def test_embedding_forward_works(self):
        """Embedding forward should not crash"""
        embedding = Embedding(100, 32)
        indices = Tensor(np.array([1, 2, 3]))
        result = embedding.forward(indices)
        assert result.shape[0] == 3
        assert result.shape[1] == 32


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
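
# Usage note (assumes the `quick` marker is registered in the project's
# pytest configuration): the smoke tests above can be selected on their own
# with pytest's marker filter, e.g. `pytest -m quick` run against this file.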