From 8cff435db91592e416ddb5a140ebe5fcefb41e6d Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi <vj@eecs.harvard.edu>
Date: Mon, 27 Oct 2025 20:30:03 -0400
Subject: [PATCH] fix(module-11): Fix Embedding and PositionalEncoding gradient
 flow

- Embedding.forward() now preserves requires_grad from weight tensor
- PositionalEncoding.forward() uses Tensor addition (x + pos) instead of .data
- Critical for transformer input embeddings to have gradients

Both changes ensure gradient flows from loss back to embedding weights
---
 .../source/11_embeddings/embeddings_dev.ipynb | 76 ++++++++++---------
 .../source/11_embeddings/embeddings_dev.py    | 18 +++--
 tinytorch/text/embeddings.py                  | 18 +++--
 3 files changed, 62 insertions(+), 50 deletions(-)

diff --git a/modules/source/11_embeddings/embeddings_dev.ipynb b/modules/source/11_embeddings/embeddings_dev.ipynb
index 20d326c7..f92733af 100644
--- a/modules/source/11_embeddings/embeddings_dev.ipynb
+++ b/modules/source/11_embeddings/embeddings_dev.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "7d9a210d",
+   "id": "4d9dd3b6",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -51,7 +51,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9e8440ef",
+   "id": "deb6c0c5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,7 +61,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "70d81596",
+   "id": "98db1095",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "1ddbc881",
+   "id": "0e8eb8a3",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -133,7 +133,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f1e4bdc9",
+   "id": "d06e3f22",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -239,7 +239,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b0e68de7",
+   "id": "f6ddc76a",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -253,7 +253,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "19fc003f",
+   "id": "de174193",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -334,7 +334,8 @@
     "        # This is equivalent to one-hot multiplication but much more efficient\n",
     "        embedded = self.weight.data[indices.data.astype(int)]\n",
     "\n",
-    "        return Tensor(embedded)\n",
+    "        # Preserve requires_grad so autograd can track this operation!\n",
+    "        return Tensor(embedded, requires_grad=self.weight.requires_grad)\n",
     "\n",
     "    def parameters(self) -> List[Tensor]:\n",
     "        \"\"\"Return trainable parameters.\"\"\"\n",
@@ -348,7 +349,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "17498acf",
+   "id": "272fcf53",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -398,7 +399,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2aeb2910",
+   "id": "2f331a75",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -437,7 +438,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2d67a811",
+   "id": "3d5a68f9",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -451,7 +452,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b82481b4",
+   "id": "2020ceda",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -537,16 +538,19 @@
     "                f\"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}\"\n",
     "            )\n",
     "\n",
-    "        # Get position embeddings for this sequence length\n",
-    "        pos_embeddings = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)\n",
+    "        # Get position embeddings for this sequence length (slice using .data for efficiency)\n",
+    "        pos_embeddings_data = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)\n",
     "\n",
     "        # Broadcast to match batch dimension: (1, seq_len, embed_dim)\n",
-    "        pos_embeddings = pos_embeddings[np.newaxis, :, :]\n",
+    "        pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :]\n",
+    "        \n",
+    "        # Wrap in Tensor to preserve requires_grad\n",
+    "        pos_embeddings = Tensor(pos_embeddings_data, requires_grad=self.position_embeddings.requires_grad)\n",
     "\n",
-    "        # Add positional information to input embeddings\n",
-    "        result = x.data + pos_embeddings\n",
+    "        # Add positional information using Tensor operation to preserve gradients!\n",
+    "        result = x + pos_embeddings\n",
     "\n",
-    "        return Tensor(result)\n",
+    "        return result\n",
     "\n",
     "    def parameters(self) -> List[Tensor]:\n",
     "        \"\"\"Return trainable parameters.\"\"\"\n",
@@ -560,7 +564,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c9f01de4",
+   "id": "61ad6469",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -616,7 +620,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "809099dc",
+   "id": "64bb6901",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -684,7 +688,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6fa0a064",
+   "id": "b3bc691b",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -698,7 +702,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "825a0299",
+   "id": "58986735",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -773,7 +777,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "440952af",
+   "id": "61cbfa9a",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -830,7 +834,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6ffbaced",
+   "id": "8b3d547c",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -850,7 +854,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e25bf119",
+   "id": "46e78a86",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -927,7 +931,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2f033b57",
+   "id": "cdc0ae73",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -1075,7 +1079,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "72670535",
+   "id": "ee80b3bd",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -1164,7 +1168,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "27c9db3b",
+   "id": "55747d44",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -1178,7 +1182,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d05c51c1",
+   "id": "18fc3b94",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -1238,7 +1242,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1b96f57b",
+   "id": "1de57928",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -1305,7 +1309,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "62edc85a",
+   "id": "4f1eb570",
    "metadata": {
     "nbgrader": {
      "grade": false,
@@ -1388,7 +1392,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "ffcbbfe8",
+   "id": "b6212e7a",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -1402,7 +1406,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9a0587e9",
+   "id": "a0927e45",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -1542,7 +1546,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0ed96c77",
+   "id": "363e02c3",
    "metadata": {
     "nbgrader": {
      "grade": false,
@@ -1561,7 +1565,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a6aff95f",
+   "id": "4ddd0c51",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -1595,7 +1599,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6b073262",
+   "id": "efedf22b",
    "metadata": {
     "cell_marker": "\"\"\""
    },
diff --git a/modules/source/11_embeddings/embeddings_dev.py b/modules/source/11_embeddings/embeddings_dev.py
index c30b50d2..15751c93 100644
--- a/modules/source/11_embeddings/embeddings_dev.py
+++ b/modules/source/11_embeddings/embeddings_dev.py
@@ -298,7 +298,8 @@ class Embedding:
         # This is equivalent to one-hot multiplication but much more efficient
         embedded = self.weight.data[indices.data.astype(int)]
 
-        return Tensor(embedded)
+        # Preserve requires_grad so autograd can track this operation!
+        return Tensor(embedded, requires_grad=self.weight.requires_grad)
 
     def parameters(self) -> List[Tensor]:
         """Return trainable parameters."""
@@ -462,16 +463,19 @@ class PositionalEncoding:
                 f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
             )
 
-        # Get position embeddings for this sequence length
-        pos_embeddings = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)
+        # Get position embeddings for this sequence length (slice using .data for efficiency)
+        pos_embeddings_data = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)
 
         # Broadcast to match batch dimension: (1, seq_len, embed_dim)
-        pos_embeddings = pos_embeddings[np.newaxis, :, :]
+        pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :]
+        
+        # Wrap in Tensor to preserve requires_grad
+        pos_embeddings = Tensor(pos_embeddings_data, requires_grad=self.position_embeddings.requires_grad)
 
-        # Add positional information to input embeddings
-        result = x.data + pos_embeddings
+        # Add positional information using Tensor operation to preserve gradients!
+        result = x + pos_embeddings
 
-        return Tensor(result)
+        return result
 
     def parameters(self) -> List[Tensor]:
         """Return trainable parameters."""
diff --git a/tinytorch/text/embeddings.py b/tinytorch/text/embeddings.py
index b71d7c4c..899a5fc6 100644
--- a/tinytorch/text/embeddings.py
+++ b/tinytorch/text/embeddings.py
@@ -95,7 +95,8 @@ class Embedding:
         # This is equivalent to one-hot multiplication but much more efficient
         embedded = self.weight.data[indices.data.astype(int)]
 
-        return Tensor(embedded)
+        # Preserve requires_grad so autograd can track this operation!
+        return Tensor(embedded, requires_grad=self.weight.requires_grad)
 
     def parameters(self) -> List[Tensor]:
         """Return trainable parameters."""
@@ -180,16 +181,19 @@ class PositionalEncoding:
                 f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
             )
 
-        # Get position embeddings for this sequence length
-        pos_embeddings = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)
+        # Get position embeddings for this sequence length (slice using .data for efficiency)
+        pos_embeddings_data = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)
 
         # Broadcast to match batch dimension: (1, seq_len, embed_dim)
-        pos_embeddings = pos_embeddings[np.newaxis, :, :]
+        pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :]
+        
+        # Wrap in Tensor to preserve requires_grad
+        pos_embeddings = Tensor(pos_embeddings_data, requires_grad=self.position_embeddings.requires_grad)
 
-        # Add positional information to input embeddings
-        result = x.data + pos_embeddings
+        # Add positional information using Tensor operation to preserve gradients!
+        result = x + pos_embeddings
 
-        return Tensor(result)
+        return result
 
     def parameters(self) -> List[Tensor]:
         """Return trainable parameters."""