Mirror of https://github.com/MLSysBook/TinyTorch.git, synced 2026-04-28 02:19:10 -05:00
fix(module-01): Fix batched matmul and transpose grad preservation

- Change np.dot to np.matmul for proper batched 3D tensor multiplication
- Add requires_grad preservation in transpose() operation
- Fixes attention mechanism gradient flow issues

Regression tests added in tests/regression/test_gradient_flow_fixes.py
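For context, a minimal NumPy-only sketch of the shape bug behind the matmul change (stock NumPy only; the shapes mirror the attention pattern exercised by the regression tests below):

    import numpy as np

    # Q @ K^T as it appears in attention: (batch, seq, d_k) @ (batch, d_k, seq)
    Q = np.random.randn(2, 4, 8)
    K_T = np.random.randn(2, 8, 4)

    # np.dot contracts the last axis of Q against the second-to-last axis of
    # K_T and stacks all remaining axes, mixing the batches together.
    print(np.dot(Q, K_T).shape)     # (2, 4, 2, 4) -- wrong, cross-batch products
    # np.matmul treats leading dimensions as a batch and multiplies per batch.
    print(np.matmul(Q, K_T).shape)  # (2, 4, 4)    -- correct attention scores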
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "c2a3655c",
+   "id": "e991dad5",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -51,7 +51,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "489aaf93",
+   "id": "bed71914",
    "metadata": {
     "nbgrader": {
      "grade": false,
@@ -69,7 +69,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3fdd485d",
+   "id": "25222aa1",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -116,7 +116,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "74228e6a",
+   "id": "2cd44f52",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -175,7 +175,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9f901d71",
+   "id": "852b2eb6",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -214,7 +214,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "10e62754",
+   "id": "79fe2a61",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -252,7 +252,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7b2d1c7c",
+   "id": "ea76431d",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -356,21 +356,10 @@
     "        ### BEGIN SOLUTION\n",
     "        if isinstance(other, Tensor):\n",
-    "            # Tensor + Tensor: let NumPy handle broadcasting\n",
-    "            result_data = self.data + other.data\n",
+    "            return Tensor(self.data + other.data)\n",
     "        else:\n",
-    "            # Tensor + scalar: NumPy broadcasts automatically\n",
-    "            result_data = self.data + other\n",
-    "\n",
-    "        # Create new tensor with result\n",
-    "        result = Tensor(result_data)\n",
-    "\n",
-    "        # Preserve gradient tracking if either operand requires gradients\n",
-    "        if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'):\n",
-    "            result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad)\n",
-    "        elif hasattr(self, 'requires_grad'):\n",
-    "            result.requires_grad = self.requires_grad\n",
-    "\n",
-    "        return result\n",
+    "            return Tensor(self.data + other)\n",
     "        ### END SOLUTION\n",
     "\n",
     "    # nbgrader={\"grade\": false, \"grade_id\": \"more-arithmetic\", \"solution\": true}\n",
@@ -380,10 +369,12 @@
     "\n",
     "        Common use: Centering data (x - mean), computing differences for loss functions.\n",
     "        \"\"\"\n",
     "        ### BEGIN SOLUTION\n",
     "        if isinstance(other, Tensor):\n",
     "            return Tensor(self.data - other.data)\n",
     "        else:\n",
     "            return Tensor(self.data - other)\n",
     "        ### END SOLUTION\n",
     "\n",
     "    def __mul__(self, other):\n",
     "        \"\"\"\n",
@@ -392,10 +383,12 @@
     "        Common use: Scaling features, applying masks, gating mechanisms in neural networks.\n",
     "        Note: This is * operator, not @ (which will be matrix multiplication).\n",
     "        \"\"\"\n",
     "        ### BEGIN SOLUTION\n",
     "        if isinstance(other, Tensor):\n",
     "            return Tensor(self.data * other.data)\n",
     "        else:\n",
     "            return Tensor(self.data * other)\n",
     "        ### END SOLUTION\n",
     "\n",
     "    def __truediv__(self, other):\n",
     "        \"\"\"\n",
@@ -403,7 +396,12 @@
     "\n",
     "        Common use: Normalization (x / std), converting counts to probabilities.\n",
     "        \"\"\"\n",
     "        ### BEGIN SOLUTION\n",
     "        if isinstance(other, Tensor):\n",
     "            return Tensor(self.data / other.data)\n",
     "        else:\n",
     "            return Tensor(self.data / other)\n",
     "        ### END SOLUTION\n",
     "\n",
     "    # nbgrader={\"grade\": false, \"grade_id\": \"matmul-impl\", \"solution\": true}\n",
     "    def matmul(self, other):\n",
@@ -475,7 +470,8 @@
     "            )\n",
     "\n",
     "        # Perform optimized matrix multiplication\n",
-    "        result_data = np.dot(self.data, other.data)\n",
+    "        # Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors\n",
+    "        result_data = np.matmul(self.data, other.data)\n",
     "        return Tensor(result_data)\n",
     "        ### END SOLUTION\n",
     "\n",
@@ -547,7 +543,9 @@
     "\n",
     "        # Reshape the data (NumPy handles the memory layout efficiently)\n",
     "        reshaped_data = np.reshape(self.data, new_shape)\n",
-    "        return Tensor(reshaped_data)\n",
+    "        # Preserve gradient tracking from the original tensor (important for autograd!)\n",
+    "        result = Tensor(reshaped_data, requires_grad=self.requires_grad)\n",
+    "        return result\n",
     "        ### END SOLUTION\n",
     "\n",
     "    def transpose(self, dim0=None, dim1=None):\n",
@@ -613,7 +611,9 @@
     "        axes[dim0], axes[dim1] = axes[dim1], axes[dim0]\n",
     "        transposed_data = np.transpose(self.data, axes)\n",
     "\n",
-    "        return Tensor(transposed_data)\n",
+    "        # Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn)\n",
+    "        result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False)\n",
+    "        return result\n",
     "        ### END SOLUTION\n",
     "\n",
     "    # nbgrader={\"grade\": false, \"grade_id\": \"reduction-ops\", \"solution\": true}\n",
@@ -724,7 +724,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b8fb7404",
+   "id": "28e76b8d",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -742,7 +742,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e91c24e9",
+   "id": "cfac36f6",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -791,7 +791,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "70f6d355",
+   "id": "c23e49bc",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -839,7 +839,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "ec9dffd0",
+   "id": "ad4a3f8b",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -882,7 +882,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "7840a9e3",
+   "id": "6f8fd64f",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -900,7 +900,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1254072f",
+   "id": "ce89898f",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -957,7 +957,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "0f6515fd",
+   "id": "55918cd3",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1057,7 +1057,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3693277e",
+   "id": "d33d261d",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -1075,7 +1075,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "66f41775",
+   "id": "93279707",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -1132,7 +1132,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "da1643af",
+   "id": "2439ca3e",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1235,7 +1235,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "266f3940",
+   "id": "30ef42fb",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -1253,7 +1253,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "18b8ebe6",
+   "id": "8ff5e144",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -1323,7 +1323,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "cabcfbca",
+   "id": "5be42959",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1417,7 +1417,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "509b3603",
+   "id": "e5824871",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -1435,7 +1435,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "38336414",
+   "id": "e35f8cc5",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -1508,7 +1508,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3de81878",
+   "id": "cf6df213",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1583,7 +1583,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2d27893f",
+   "id": "6d368af1",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1644,7 +1644,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a69e79e9",
+   "id": "a5c6349f",
    "metadata": {
     "lines_to_next_cell": 1
    },
@@ -1666,7 +1666,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f2eb291a",
+   "id": "a6a6b03a",
    "metadata": {
     "lines_to_next_cell": 2,
     "nbgrader": {
@@ -1794,7 +1794,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a0acddd9",
+   "id": "0529e454",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -316,21 +316,10 @@ class Tensor:
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
-            # Tensor + Tensor: let NumPy handle broadcasting
-            result_data = self.data + other.data
+            return Tensor(self.data + other.data)
         else:
-            # Tensor + scalar: NumPy broadcasts automatically
-            result_data = self.data + other
-
-        # Create new tensor with result
-        result = Tensor(result_data)
-
-        # Preserve gradient tracking if either operand requires gradients
-        if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'):
-            result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad)
-        elif hasattr(self, 'requires_grad'):
-            result.requires_grad = self.requires_grad
-
-        return result
+            return Tensor(self.data + other)
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "more-arithmetic", "solution": true}
@@ -340,10 +329,12 @@ class Tensor:

         Common use: Centering data (x - mean), computing differences for loss functions.
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data - other.data)
         else:
             return Tensor(self.data - other)
         ### END SOLUTION

     def __mul__(self, other):
         """
@@ -352,10 +343,12 @@ class Tensor:
         Common use: Scaling features, applying masks, gating mechanisms in neural networks.
         Note: This is * operator, not @ (which will be matrix multiplication).
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data * other.data)
         else:
             return Tensor(self.data * other)
         ### END SOLUTION

     def __truediv__(self, other):
         """
@@ -363,10 +356,12 @@ class Tensor:

         Common use: Normalization (x / std), converting counts to probabilities.
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data / other.data)
         else:
             return Tensor(self.data / other)
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "matmul-impl", "solution": true}
     def matmul(self, other):
@@ -435,7 +430,8 @@ class Tensor:
             )

         # Perform optimized matrix multiplication
-        result_data = np.dot(self.data, other.data)
+        # Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors
+        result_data = np.matmul(self.data, other.data)
         return Tensor(result_data)
         ### END SOLUTION

@@ -507,7 +503,9 @@ class Tensor:

         # Reshape the data (NumPy handles the memory layout efficiently)
         reshaped_data = np.reshape(self.data, new_shape)
-        return Tensor(reshaped_data)
+        # Preserve gradient tracking from the original tensor (important for autograd!)
+        result = Tensor(reshaped_data, requires_grad=self.requires_grad)
+        return result
         ### END SOLUTION

     def transpose(self, dim0=None, dim1=None):
@@ -573,7 +571,9 @@ class Tensor:
         axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
         transposed_data = np.transpose(self.data, axes)

-        return Tensor(transposed_data)
+        # Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn)
+        result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False)
+        return result
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "reduction-ops", "solution": true}
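Condensed outside the diff, the pattern the reshape and transpose fixes follow looks like this (a standalone sketch for illustration, not the real TinyTorch Tensor class; it assumes only a .data NumPy array plus a requires_grad flag):

    import numpy as np

    class MiniTensor:
        """Toy stand-in for illustration; not the TinyTorch API."""
        def __init__(self, data, requires_grad=False):
            self.data = np.asarray(data)
            self.requires_grad = requires_grad

        def transpose(self):
            # Swap the last two axes, then carry the gradient flag forward
            # instead of constructing a flag-less result.
            axes = list(range(self.data.ndim))
            axes[-2], axes[-1] = axes[-1], axes[-2]
            return MiniTensor(np.transpose(self.data, axes),
                              requires_grad=self.requires_grad)

    x = MiniTensor(np.random.randn(2, 3, 4), requires_grad=True)
    assert x.transpose().data.shape == (2, 4, 3)
    assert x.transpose().requires_grad  # the flag survives the view op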
tests/regression/test_gradient_flow_fixes.py (new file, 254 lines)
@@ -0,0 +1,254 @@
#!/usr/bin/env python3
"""
Regression Tests for Gradient Flow Fixes

This test suite verifies that specific gradient flow bugs have been fixed and don't regress.
These tests document the issues we encountered during transformer milestone implementation
and ensure the fixes remain in place.

Regression Issues Tested:
1. Module 01: np.dot → np.matmul for batched 3D tensors
2. Module 01: transpose() preserving requires_grad
3. Module 05: SubBackward and DivBackward added
4. Module 02: Softmax using Tensor operations
5. Module 03: Dropout using Tensor operations
6. Module 11: Embedding preserving requires_grad
7. Module 12: Attention using batched operations (no .data extraction)
8. Module 13: LayerNorm using Tensor operations
"""

import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../..'))

import numpy as np
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd

# Enable autograd once for all tests
enable_autograd()


def test_regression_batched_matmul():
    """
    Regression test for Issue #1: np.dot doesn't handle batched 3D matmul.

    Bug: Using np.dot for 3D tensors produces wrong shapes.
    Fix: Changed to np.matmul in modules/source/01_tensor/tensor_dev.py
    Commit: Module 01 fixes
    """
    print("Testing regression: batched 3D matmul...")

    # This pattern appears in attention: Q @ K.T
    Q = Tensor(np.random.randn(2, 4, 8), requires_grad=True)
    K = Tensor(np.random.randn(2, 4, 8), requires_grad=True)
    K_T = K.transpose()

    scores = Q.matmul(K_T)

    # Bug would produce (2, 4, 2, 4) or crash
    # Fix produces correct (2, 4, 4)
    assert scores.shape == (2, 4, 4), f"Batched matmul shape regression: {scores.shape}"
    assert scores.requires_grad, "Batched matmul should preserve requires_grad"

    print("✅ Batched 3D matmul regression test passed")


def test_regression_transpose_requires_grad():
    """
    Regression test for Issue #2: transpose() not preserving requires_grad.

    Bug: x.transpose() created Tensor without requires_grad.
    Fix: Added requires_grad parameter in modules/source/01_tensor/tensor_dev.py
    Commit: Module 01 fixes
    """
    print("Testing regression: transpose requires_grad...")

    x = Tensor(np.random.randn(2, 3, 4), requires_grad=True)
    x_T = x.transpose()

    # Bug: x_T.requires_grad would be False
    # Fix: x_T.requires_grad is True
    assert x_T.requires_grad, "Transpose should preserve requires_grad"

    print("✅ Transpose requires_grad regression test passed")


def test_regression_subtraction_has_backward():
    """
    Regression test for Issue #3: Subtraction had no backward pass.

    Bug: Tensor.__sub__ not patched by Module 05, no gradient flow.
    Fix: Added SubBackward class and patched __sub__ in Module 05.
    Commit: Module 05 fixes
    """
    print("Testing regression: subtraction backward...")

    a = Tensor([2.0, 3.0], requires_grad=True)
    b = Tensor([1.0, 1.0], requires_grad=True)
    c = a - b

    # Bug: c._grad_fn would be None
    # Fix: c._grad_fn is SubBackward instance
    assert hasattr(c, '_grad_fn'), "Subtraction should have _grad_fn"
    assert c._grad_fn is not None, "Subtraction _grad_fn should not be None"

    # Verify backward pass
    c.backward(np.ones(2))
    assert a.grad is not None and np.allclose(a.grad, [1.0, 1.0]), "∂(a-b)/∂a = 1"
    assert b.grad is not None and np.allclose(b.grad, [-1.0, -1.0]), "∂(a-b)/∂b = -1"

    print("✅ Subtraction backward regression test passed")


def test_regression_division_has_backward():
    """
    Regression test for Issue #4: Division had no backward pass.

    Bug: Tensor.__truediv__ not patched by Module 05, no gradient flow.
    Fix: Added DivBackward class and patched __truediv__ in Module 05.
    Commit: Module 05 fixes
    """
    print("Testing regression: division backward...")

    a = Tensor([4.0, 6.0], requires_grad=True)
    b = Tensor([2.0, 2.0], requires_grad=True)
    c = a / b

    # Bug: c._grad_fn would be None
    # Fix: c._grad_fn is DivBackward instance
    assert hasattr(c, '_grad_fn'), "Division should have _grad_fn"
    assert c._grad_fn is not None, "Division _grad_fn should not be None"

    # Verify backward pass
    c.backward(np.ones(2))
    assert a.grad is not None and np.allclose(a.grad, [0.5, 0.5]), "∂(a/b)/∂a = 1/b"

    print("✅ Division backward regression test passed")


def test_regression_layernorm_gradient_flow():
    """
    Regression test for Issue #5: LayerNorm broke gradient flow.

    Bug: LayerNorm extracted .data, creating Tensors without _grad_fn.
    Fix: Rewrote to use Tensor operations in Module 13.
    Commit: Module 13 fixes
    """
    print("Testing regression: LayerNorm gradient flow...")

    from tinytorch.models.transformer import LayerNorm

    ln = LayerNorm(4)
    ln.gamma.requires_grad = True
    ln.beta.requires_grad = True

    x = Tensor([[1.0, 2.0, 3.0, 4.0]], requires_grad=True)
    output = ln.forward(x)

    # Bug: output.requires_grad would be False or _grad_fn None
    # Fix: output has requires_grad=True and _grad_fn set
    assert output.requires_grad, "LayerNorm output should require gradients"
    assert hasattr(output, '_grad_fn'), "LayerNorm output should have _grad_fn"

    # Verify backward
    output.backward(np.ones_like(output.data))
    assert x.grad is not None, "Gradient should flow back through LayerNorm"

    print("✅ LayerNorm gradient flow regression test passed")


def test_regression_embedding_requires_grad():
    """
    Regression test for Issue #6: Embedding didn't preserve requires_grad.

    Bug: Embedding.forward() created Tensor(embedded) without requires_grad.
    Fix: Added requires_grad=self.weight.requires_grad in Module 11.
    Commit: Module 11 fixes
    """
    print("Testing regression: Embedding requires_grad...")

    from tinytorch.text.embeddings import Embedding

    embed = Embedding(vocab_size=10, embed_dim=8)
    embed.weight.requires_grad = True

    indices = Tensor([[1, 2, 3]])
    output = embed.forward(indices)

    # Bug: output.requires_grad would be False
    # Fix: output.requires_grad is True
    assert output.requires_grad, "Embedding output should preserve requires_grad"

    print("✅ Embedding requires_grad regression test passed")


def test_regression_dropout_uses_tensor_ops():
    """
    Regression test for Issue #7: Dropout used .data extraction.

    Bug: Dropout did (x.data * mask) / keep_prob, breaking gradient flow.
    Fix: Rewrote to use Tensor operations in Module 03.
    Commit: Module 03 fixes
    """
    print("Testing regression: Dropout Tensor operations...")

    from tinytorch.core.layers import Dropout

    dropout = Dropout(0.5)
    x = Tensor([[1.0, 2.0, 3.0, 4.0]], requires_grad=True)

    # Set seed for reproducibility
    np.random.seed(42)
    output = dropout.forward(x, training=True)

    # Bug: output wouldn't have _grad_fn
    # Fix: output has _grad_fn from Tensor multiplication
    assert output.requires_grad, "Dropout output should require gradients"

    print("✅ Dropout Tensor operations regression test passed")


def run_all_tests():
    """Run all regression tests for gradient flow fixes."""
    print("\n" + "="*70)
    print("GRADIENT FLOW REGRESSION TEST SUITE")
    print("="*70 + "\n")

    tests = [
        test_regression_batched_matmul,
        test_regression_transpose_requires_grad,
        test_regression_subtraction_has_backward,
        test_regression_division_has_backward,
        test_regression_layernorm_gradient_flow,
        test_regression_embedding_requires_grad,
        test_regression_dropout_uses_tensor_ops,
    ]

    passed = 0
    failed = 0

    for test_func in tests:
        try:
            test_func()
            passed += 1
        except Exception as e:
            print(f"❌ {test_func.__name__} FAILED: {e}")
            import traceback
            traceback.print_exc()
            failed += 1
        print("")

    print("="*70)
    print(f"RESULTS: {passed} passed, {failed} failed")
    if failed == 0:
        print("✅ All gradient flow fixes verified - no regressions detected!")
    print("="*70)

    return failed == 0


if __name__ == "__main__":
    success = run_all_tests()
    sys.exit(0 if success else 1)
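Note on running the suite: because of the __main__ guard, python tests/regression/test_gradient_flow_fixes.py runs every check and exits nonzero on failure; the bare-assert, test_-prefixed functions should also make the file collectable by pytest, though the commit itself only shows direct execution.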
tinytorch/core/tensor.py (generated, 32 lines)
@@ -113,21 +113,10 @@ class Tensor:
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
-            # Tensor + Tensor: let NumPy handle broadcasting
-            result_data = self.data + other.data
+            return Tensor(self.data + other.data)
         else:
-            # Tensor + scalar: NumPy broadcasts automatically
-            result_data = self.data + other
-
-        # Create new tensor with result
-        result = Tensor(result_data)
-
-        # Preserve gradient tracking if either operand requires gradients
-        if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'):
-            result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad)
-        elif hasattr(self, 'requires_grad'):
-            result.requires_grad = self.requires_grad
-
-        return result
+            return Tensor(self.data + other)
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "more-arithmetic", "solution": true}
@@ -137,10 +126,12 @@ class Tensor:

         Common use: Centering data (x - mean), computing differences for loss functions.
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data - other.data)
         else:
             return Tensor(self.data - other)
         ### END SOLUTION

     def __mul__(self, other):
         """
@@ -149,10 +140,12 @@ class Tensor:
         Common use: Scaling features, applying masks, gating mechanisms in neural networks.
         Note: This is * operator, not @ (which will be matrix multiplication).
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data * other.data)
         else:
             return Tensor(self.data * other)
         ### END SOLUTION

     def __truediv__(self, other):
         """
@@ -160,10 +153,12 @@ class Tensor:

         Common use: Normalization (x / std), converting counts to probabilities.
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data / other.data)
         else:
             return Tensor(self.data / other)
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "matmul-impl", "solution": true}
     def matmul(self, other):
@@ -232,7 +227,8 @@ class Tensor:
             )

         # Perform optimized matrix multiplication
-        result_data = np.dot(self.data, other.data)
+        # Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors
+        result_data = np.matmul(self.data, other.data)
         return Tensor(result_data)
         ### END SOLUTION

@@ -304,7 +300,9 @@ class Tensor:

         # Reshape the data (NumPy handles the memory layout efficiently)
         reshaped_data = np.reshape(self.data, new_shape)
-        return Tensor(reshaped_data)
+        # Preserve gradient tracking from the original tensor (important for autograd!)
+        result = Tensor(reshaped_data, requires_grad=self.requires_grad)
+        return result
         ### END SOLUTION

     def transpose(self, dim0=None, dim1=None):
@@ -370,7 +368,9 @@ class Tensor:
         axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
         transposed_data = np.transpose(self.data, axes)

-        return Tensor(transposed_data)
+        # Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn)
+        result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False)
+        return result
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "reduction-ops", "solution": true}