Fix gradient flow tests - use tensor operations for loss

The tests were creating losses incorrectly by breaking the computation graph:
  WRONG: loss = Tensor(np.sum(output.data))  # Breaks graph!
  RIGHT: loss = output.sum()  # Maintains graph
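
For illustration, here is a minimal autograd sketch of why the WRONG form fails. The toy Tensor class below, its attribute names, and its traversal logic are assumptions for this example, not the project's real implementation; the point is only that wrapping np.sum(output.data) in a fresh Tensor creates a node with no parents, so backward() has nothing to traverse:

  import numpy as np

  class Tensor:
      # Toy stand-in for the project's Tensor; names and internals are assumed.
      def __init__(self, data, requires_grad=False):
          self.data = np.asarray(data, dtype=float)
          self.requires_grad = requires_grad
          self.grad = None
          self._parents = []              # graph edges back to inputs
          self._backward = lambda: None   # how to push grad to parents

      def sum(self):
          # Tensor operation: the result records self as a parent.
          out = Tensor(self.data.sum(), requires_grad=self.requires_grad)
          out._parents = [self]
          def push():
              if self.requires_grad:
                  g = np.ones_like(self.data) * out.grad
                  self.grad = g if self.grad is None else self.grad + g
          out._backward = push
          return out

      def backward(self):
          self.grad = np.ones_like(self.data)
          stack = [self]                  # simplified traversal (chain graphs)
          while stack:
              node = stack.pop()
              node._backward()
              stack.extend(node._parents)

  x = Tensor(np.ones((2, 2)), requires_grad=True)

  # WRONG: np.sum returns a plain NumPy scalar; the new Tensor has no
  # parents, so backward() never reaches x.
  loss = Tensor(np.sum(x.data))
  loss.backward()
  print(x.grad)   # None -- the graph was broken

  # RIGHT: x.sum() records x as a parent, so the gradient flows back.
  loss = x.sum()
  loss.backward()
  print(x.grad)   # [[1. 1.] [1. 1.]]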

Fixed:
- test_cnn_integration.py: Conv2d and CNN gradient tests
- test_nlp_pipeline_flow.py: Attention gradient tests
- Removed xfail marker from attention test (now passing; see the xfail sketch below)
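
For context, pytest's xfail marker flags a test as expected to fail: an expected failure is reported as XFAIL instead of breaking the suite, and an unexpected pass as XPASS. Once the underlying bug is fixed, the marker is removed so a regression fails loudly again. A generic sketch, not from this repo:

  import pytest

  @pytest.mark.xfail(reason="Known issue: feature not implemented yet")
  def test_not_yet_supported():
      assert 1 + 1 == 3   # fails today, so pytest reports XFAIL, not an error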

The underlying Conv2d and Attention implementations were correct all along.

commit 6546b56c23
parent d9633041e1
Author: Vijay Janapa Reddi
Date:   2025-12-02 22:22:51 -05:00
2 changed files with 8 additions and 7 deletions

test_cnn_integration.py

@@ -199,7 +199,8 @@ class TestCNNGradientFlow:
         output = conv.forward(x)
 
         # Create a simple loss (sum of all outputs)
-        loss = Tensor(np.sum(output.data))
+        # IMPORTANT: Use tensor operation to maintain computation graph!
+        loss = output.sum()
 
         # Backward pass
         loss.backward()
@@ -242,8 +243,8 @@ class TestCNNGradientFlow:
         conv2.weight.requires_grad = True
         out3 = conv2.forward(out2)
 
-        # Loss
-        loss = Tensor(np.mean(out3.data))
+        # Loss - use tensor operation to maintain computation graph
+        loss = out3.sum()
 
         # Backward
         loss.backward()

test_nlp_pipeline_flow.py

@@ -138,8 +138,8 @@ class TestAttentionGradientFlow:
         # Forward pass (self-attention - single input for Q, K, V)
         output = attention.forward(x)
 
-        # Simple loss
-        loss = Tensor(np.array([[output.data.sum()]]), requires_grad=True)
+        # Simple loss - use tensor operation to maintain computation graph
+        loss = output.sum()
         loss.backward()
 
         # All projection matrices should have gradients
@@ -152,7 +152,6 @@ class TestAttentionGradientFlow:
                 f"{proj_name} did not receive gradients!"
             )
 
-    @pytest.mark.xfail(reason="Known issue: Attention gradient flow needs fix - see Module 12")
     def test_attention_input_receives_gradients(self):
         """Input to attention must receive gradients for residual connections"""
         try:
@@ -171,7 +170,8 @@ class TestAttentionGradientFlow:
             )
 
             output = attention.forward(x)
-            loss = Tensor(np.array([[output.data.sum()]]), requires_grad=True)
+            # Use tensor operation to maintain computation graph
+            loss = output.sum()
             loss.backward()
 
             assert x.grad is not None, (