mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 20:13:39 -05:00
Fix gradient flow tests - use tensor operations for loss
The tests were creating losses incorrectly by breaking the computation graph:

    WRONG: loss = Tensor(np.sum(output.data))  # Breaks graph!
    RIGHT: loss = output.sum()                  # Maintains graph

Fixed:
- test_cnn_integration.py: Conv2d and CNN gradient tests
- test_nlp_pipeline_flow.py: Attention gradient tests
- Removed xfail marker from attention test (now passing)

The underlying Conv2d and Attention implementations were correct all along.
This commit is contained in:
@@ -199,7 +199,8 @@ class TestCNNGradientFlow:
|
||||
output = conv.forward(x)
|
||||
|
||||
# Create a simple loss (sum of all outputs)
|
||||
loss = Tensor(np.sum(output.data))
|
||||
# IMPORTANT: Use tensor operation to maintain computation graph!
|
||||
loss = output.sum()
|
||||
|
||||
# Backward pass
|
||||
loss.backward()
|
||||
@@ -242,8 +243,8 @@ class TestCNNGradientFlow:
|
||||
conv2.weight.requires_grad = True
|
||||
out3 = conv2.forward(out2)
|
||||
|
||||
# Loss
|
||||
loss = Tensor(np.mean(out3.data))
|
||||
# Loss - use tensor operation to maintain computation graph
|
||||
loss = out3.sum()
|
||||
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
@@ -138,8 +138,8 @@ class TestAttentionGradientFlow:
|
||||
# Forward pass (self-attention - single input for Q, K, V)
|
||||
output = attention.forward(x)
|
||||
|
||||
# Simple loss
|
||||
loss = Tensor(np.array([[output.data.sum()]]), requires_grad=True)
|
||||
# Simple loss - use tensor operation to maintain computation graph
|
||||
loss = output.sum()
|
||||
loss.backward()
|
||||
|
||||
# All projection matrices should have gradients
|
||||
@@ -152,7 +152,6 @@ class TestAttentionGradientFlow:
|
||||
f"{proj_name} did not receive gradients!"
|
||||
)
|
||||
|
||||
@pytest.mark.xfail(reason="Known issue: Attention gradient flow needs fix - see Module 12")
|
||||
def test_attention_input_receives_gradients(self):
|
||||
"""Input to attention must receive gradients for residual connections"""
|
||||
try:
|
||||
@@ -171,7 +170,8 @@ class TestAttentionGradientFlow:
|
||||
)
|
||||
|
||||
output = attention.forward(x)
|
||||
loss = Tensor(np.array([[output.data.sum()]]), requires_grad=True)
|
||||
# Use tensor operation to maintain computation graph
|
||||
loss = output.sum()
|
||||
loss.backward()
|
||||
|
||||
assert x.grad is not None, (
|
||||
|
||||
Reference in New Issue
Block a user