fix(module-13): Rewrite LayerNorm to use Tensor operations

- Change from .data extraction to Tensor arithmetic (x - mean, diff * diff, x / std)
- Preserve computation graph through normalization
- std tensor now preserves requires_grad correctly

LayerNorm is used before and after attention in transformer blocks.
This commit is contained in:
Vijay Janapa Reddi
2025-10-27 20:30:21 -04:00
parent c23946b20e
commit 8025c66a4b
3 changed files with 66 additions and 53 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"id": "5aea9c35",
"id": "33c199c2",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -36,7 +36,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "f22e51a9",
"id": "1ec63d43",
"metadata": {},
"outputs": [],
"source": [
@@ -46,7 +46,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "a018998a",
"id": "3d486c1e",
"metadata": {},
"outputs": [],
"source": [
@@ -60,7 +60,7 @@
},
{
"cell_type": "markdown",
"id": "01b92407",
"id": "4afca29a",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -85,7 +85,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "cf1a206a",
"id": "135c0c51",
"metadata": {
"lines_to_next_cell": 2
},
@@ -104,7 +104,7 @@
},
{
"cell_type": "markdown",
"id": "0e7e76fd",
"id": "3057e8a0",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -190,7 +190,7 @@
},
{
"cell_type": "markdown",
"id": "0a43cd3d",
"id": "3d854c15",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -325,7 +325,7 @@
},
{
"cell_type": "markdown",
"id": "dccfbe05",
"id": "f894e04b",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -343,7 +343,7 @@
},
{
"cell_type": "markdown",
"id": "5e866445",
"id": "b4646db7",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -411,7 +411,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "e4495508",
"id": "7e5a454a",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -484,12 +484,15 @@
" mean = x.mean(axis=-1, keepdims=True)\n",
"\n",
" # Compute variance: E[(x - μ)²]\n",
" diff = Tensor(x.data - mean.data)\n",
" variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))\n",
" # Use Tensor operations to preserve computation graph!\n",
" diff = x - mean\n",
" variance = (diff * diff).mean(axis=-1, keepdims=True)\n",
"\n",
" # Normalize\n",
" std = Tensor(np.sqrt(variance.data + self.eps))\n",
" normalized = Tensor((x.data - mean.data) / std.data)\n",
" # Normalize - use Tensor operations to preserve gradients!\n",
" # Add eps as a Tensor for proper gradient flow\n",
" eps_tensor = Tensor(np.array(self.eps), requires_grad=False)\n",
" std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)\n",
" normalized = (x - mean) / std\n",
"\n",
" # Apply learnable transformation\n",
" output = normalized * self.gamma + self.beta\n",
@@ -503,7 +506,7 @@
},
{
"cell_type": "markdown",
"id": "f50de247",
"id": "92bbef2d",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -519,7 +522,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "29808a79",
"id": "a824ba3e",
"metadata": {
"nbgrader": {
"grade": true,
@@ -566,7 +569,7 @@
},
{
"cell_type": "markdown",
"id": "f99ac0f0",
"id": "34a77537",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -651,7 +654,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "e2e7c950",
"id": "2f1b0cf0",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -742,7 +745,7 @@
},
{
"cell_type": "markdown",
"id": "12d2d450",
"id": "4e55a5d6",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -758,7 +761,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "72c129bb",
"id": "02896cfd",
"metadata": {
"nbgrader": {
"grade": true,
@@ -806,7 +809,7 @@
},
{
"cell_type": "markdown",
"id": "d5693773",
"id": "f0af20f9",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -908,7 +911,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "1983128e",
"id": "be774576",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1021,7 +1024,7 @@
},
{
"cell_type": "markdown",
"id": "e738b0a2",
"id": "20976835",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1037,7 +1040,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "3ecf15a9",
"id": "3ad677b9",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1088,7 +1091,7 @@
},
{
"cell_type": "markdown",
"id": "d0c34790",
"id": "3e0a6497",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1242,7 +1245,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d0fe1c53",
"id": "b6077781",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1440,7 +1443,7 @@
},
{
"cell_type": "markdown",
"id": "8ab2a056",
"id": "223ab70e",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1456,7 +1459,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "df2c9d2a",
"id": "843a027b",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1514,7 +1517,7 @@
},
{
"cell_type": "markdown",
"id": "86e2cffb",
"id": "11c7afd8",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1560,8 +1563,9 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d11920a5",
"id": "464575ff",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
"grade": false,
"grade_id": "integration-demo",
@@ -1627,12 +1631,12 @@
"\n",
" return model\n",
"\n",
"demonstrate_transformer_integration()"
"# demonstrate_transformer_integration() # Moved to __main__ block below"
]
},
{
"cell_type": "markdown",
"id": "777c88af",
"id": "65e903ac",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1717,7 +1721,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "66957b87",
"id": "065a32d8",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1774,7 +1778,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "9badfce3",
"id": "8ff38096",
"metadata": {
"nbgrader": {
"grade": false,
@@ -1819,7 +1823,7 @@
},
{
"cell_type": "markdown",
"id": "1701db55",
"id": "01719014",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1833,8 +1837,9 @@
{
"cell_type": "code",
"execution_count": null,
"id": "93ecb08c",
"id": "d18c01d8",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
"grade": true,
"grade_id": "test-module",
@@ -1907,25 +1912,26 @@
" print(\"Run: tito module complete 13\")\n",
"\n",
"# Call the comprehensive test\n",
"test_module()"
"# test_module() # Only run in __main__ block below"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75f8b229",
"id": "009d2ab0",
"metadata": {},
"outputs": [],
"source": [
"if __name__ == \"__main__\":\n",
" print(\"🚀 Running Transformers module...\")\n",
" demonstrate_transformer_integration()\n",
" test_module()\n",
" print(\"✅ Module validation complete!\")"
]
},
{
"cell_type": "markdown",
"id": "5245c0f2",
"id": "28ae7326",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -1965,7 +1971,7 @@
},
{
"cell_type": "markdown",
"id": "ed29968a",
"id": "eaa4c950",
"metadata": {
"cell_marker": "\"\"\""
},

View File

@@ -430,12 +430,15 @@ class LayerNorm:
mean = x.mean(axis=-1, keepdims=True)
# Compute variance: E[(x - μ)²]
diff = Tensor(x.data - mean.data)
variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))
# Use Tensor operations to preserve computation graph!
diff = x - mean
variance = (diff * diff).mean(axis=-1, keepdims=True)
# Normalize
std = Tensor(np.sqrt(variance.data + self.eps))
normalized = Tensor((x.data - mean.data) / std.data)
# Normalize - use Tensor operations to preserve gradients!
# Add eps as a Tensor for proper gradient flow
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
normalized = (x - mean) / std
# Apply learnable transformation
output = normalized * self.gamma + self.beta
@@ -1414,7 +1417,7 @@ def demonstrate_transformer_integration():
return model
demonstrate_transformer_integration()
# demonstrate_transformer_integration() # Moved to __main__ block below
# %% [markdown]
"""
@@ -1641,11 +1644,12 @@ def test_module():
print("Run: tito module complete 13")
# Call the comprehensive test
test_module()
# test_module() # Only run in __main__ block below
# %%
if __name__ == "__main__":
print("🚀 Running Transformers module...")
demonstrate_transformer_integration()
test_module()
print("✅ Module validation complete!")

View File

@@ -86,12 +86,15 @@ class LayerNorm:
mean = x.mean(axis=-1, keepdims=True)
# Compute variance: E[(x - μ)²]
diff = Tensor(x.data - mean.data)
variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))
# Use Tensor operations to preserve computation graph!
diff = x - mean
variance = (diff * diff).mean(axis=-1, keepdims=True)
# Normalize
std = Tensor(np.sqrt(variance.data + self.eps))
normalized = Tensor((x.data - mean.data) / std.data)
# Normalize - use Tensor operations to preserve gradients!
# Add eps as a Tensor for proper gradient flow
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
normalized = (x - mean) / std
# Apply learnable transformation
output = normalized * self.gamma + self.beta