fix(module-02): Rewrite Softmax to use Tensor operations

- Preserve computation graph by using Tensor arithmetic (x - x_max, exp / sum)
- No more .data extraction that breaks gradient flow
- Numerically stable with max subtraction before exp

Required for gradient flow through the transformer attention softmax
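
For illustration, a minimal sketch of the failure mode and the fix. PyTorch stands in for the module's Tensor class here (an assumption: only the arithmetic ops shown in the diff below are confirmed for Tensor; .detach(), .backward(), and .grad are PyTorch's API):

    # Hedged sketch: PyTorch as a stand-in autograd engine.
    import torch

    x = torch.tensor([[1.0, 2.0, 3.0]], requires_grad=True)

    # Broken pattern: dropping to raw arrays (the old `.data` extraction)
    # severs the computation graph, so nothing downstream can backpropagate.
    x_raw = x.detach().numpy()  # gradient flow ends here

    # Fixed pattern: stay in tensor operations end to end.
    x_max = x.max(dim=-1, keepdim=True).values.detach()  # constant shift
    probs = (x - x_max).exp()
    probs = probs / probs.sum(dim=-1, keepdim=True)

    probs[0, 2].backward()
    print(x.grad)  # populated, because every op stayed on the graph

If x.grad came back None, some step in the chain had dropped to raw data.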
Vijay Janapa Reddi
2025-10-27 20:29:35 -04:00
parent fb753882ec
commit 4fa00b51b3
3 changed files with 65 additions and 56 deletions

@@ -245,18 +245,21 @@ class Softmax:
"""
### BEGIN SOLUTION
# Numerical stability: subtract max to prevent overflow
x_max = np.max(x.data, axis=dim, keepdims=True)
x_shifted = x.data - x_max
# Use Tensor operations to preserve gradient flow!
x_max_data = np.max(x.data, axis=dim, keepdims=True)
x_max = Tensor(x_max_data, requires_grad=False) # max is not differentiable in this context
x_shifted = x - x_max # Tensor subtraction!
# Compute exponentials
exp_values = np.exp(x_shifted)
# Compute exponentials (NumPy operation, but wrapped in Tensor)
exp_values = Tensor(np.exp(x_shifted.data), requires_grad=x_shifted.requires_grad)
# Sum along dimension
exp_sum = np.sum(exp_values, axis=dim, keepdims=True)
# Sum along dimension (Tensor operation)
exp_sum_data = np.sum(exp_values.data, axis=dim, keepdims=True)
exp_sum = Tensor(exp_sum_data, requires_grad=exp_values.requires_grad)
# Normalize to get probabilities
# Normalize to get probabilities (Tensor division!)
result = exp_values / exp_sum
return Tensor(result)
return result
### END SOLUTION
def __call__(self, x: Tensor, dim: int = -1) -> Tensor:
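
A side note on the stability comment in the hunk above: subtracting the row max is safe because softmax is shift-invariant (softmax(x) == softmax(x - c) for any constant c), and it keeps every exponent <= 0 so np.exp cannot overflow. A standalone NumPy check, independent of the Tensor class:

    import numpy as np

    def softmax_naive(x, axis=-1):
        e = np.exp(x)
        return e / e.sum(axis=axis, keepdims=True)

    def softmax_stable(x, axis=-1):
        # Shift by the max: exponents become <= 0, so np.exp never overflows.
        z = x - x.max(axis=axis, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=axis, keepdims=True)

    x = np.array([[1000.0, 1001.0, 1002.0]])
    print(softmax_naive(x))   # [[nan nan nan]] -- exp(1000) overflows to inf
    print(softmax_stable(x))  # [[0.09003057 0.24472847 0.66524096]]

Both versions agree whenever the naive one does not overflow; the shifted form simply never leaves the representable range.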