fix(module-13): Rewrite LayerNorm to use Tensor operations

- Change from .data extraction to Tensor arithmetic (x - mean, diff * diff, x / std)
- Preserve computation graph through normalization
- std tensor now preserves requires_grad correctly

LayerNorm is used before and after attention in transformer blocks.
This commit is contained in:
Vijay Janapa Reddi
2025-10-27 20:30:21 -04:00
parent c23946b20e
commit 8025c66a4b
3 changed files with 66 additions and 53 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"id": "5aea9c35",
"id": "33c199c2",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -36,7 +36,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "f22e51a9",
"id": "1ec63d43",
"metadata": {},
"outputs": [],
"source": [
@@ -46,7 +46,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "a018998a",
"id": "3d486c1e",
"metadata": {},
"outputs": [],
"source": [
@@ -60,7 +60,7 @@
},
{
"cell_type": "markdown",
"id": "01b92407",
"id": "4afca29a",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -85,7 +85,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "cf1a206a",
"id": "135c0c51",
"metadata": {
"lines_to_next_cell": 2
},
@@ -104,7 +104,7 @@
},
{
"cell_type": "markdown",
"id": "0e7e76fd",
"id": "3057e8a0",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -190,7 +190,7 @@
},
{
"cell_type": "markdown",
"id": "0a43cd3d",
"id": "3d854c15",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -325,7 +325,7 @@
},
{
"cell_type": "markdown",
"id": "dccfbe05",
"id": "f894e04b",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -343,7 +343,7 @@
},
{
"cell_type": "markdown",
"id": "5e866445",
"id": "b4646db7",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -411,7 +411,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "e4495508",
"id": "7e5a454a",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -484,12 +484,15 @@
" mean = x.mean(axis=-1, keepdims=True)\n",
"\n",
" # Compute variance: E[(x - μ)²]\n",
" diff = Tensor(x.data - mean.data)\n",
" variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))\n",
" # Use Tensor operations to preserve computation graph!\n",
" diff = x - mean\n",
" variance = (diff * diff).mean(axis=-1, keepdims=True)\n",
"\n",
" # Normalize\n",
" std = Tensor(np.sqrt(variance.data + self.eps))\n",
" normalized = Tensor((x.data - mean.data) / std.data)\n",
" # Normalize - use Tensor operations to preserve gradients!\n",
" # Add eps as a Tensor for proper gradient flow\n",
" eps_tensor = Tensor(np.array(self.eps), requires_grad=False)\n",
" std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)\n",
" normalized = (x - mean) / std\n",
"\n",
" # Apply learnable transformation\n",
" output = normalized * self.gamma + self.beta\n",
@@ -503,7 +506,7 @@
},
{
"cell_type": "markdown",
"id": "f50de247",
"id": "92bbef2d",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -519,7 +522,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "29808a79",
"id": "a824ba3e",
"metadata": {
"nbgrader": {
"grade": true,
@@ -566,7 +569,7 @@
},
{
"cell_type": "markdown",
"id": "f99ac0f0",
"id": "34a77537",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -651,7 +654,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "e2e7c950",
"id": "2f1b0cf0",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -742,7 +745,7 @@
},
{
"cell_type": "markdown",
"id": "12d2d450",
"id": "4e55a5d6",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -758,7 +761,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "72c129bb",
"id": "02896cfd",
"metadata": {
"nbgrader": {
"grade": true,
@@ -806,7 +809,7 @@
},
{
"cell_type": "markdown",
"id": "d5693773",
"id": "f0af20f9",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -908,7 +911,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "1983128e",
"id": "be774576",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1021,7 +1024,7 @@
},
{
"cell_type": "markdown",
"id": "e738b0a2",
"id": "20976835",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1037,7 +1040,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "3ecf15a9",
"id": "3ad677b9",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1088,7 +1091,7 @@
},
{
"cell_type": "markdown",
"id": "d0c34790",
"id": "3e0a6497",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1242,7 +1245,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d0fe1c53",
"id": "b6077781",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1440,7 +1443,7 @@
},
{
"cell_type": "markdown",
"id": "8ab2a056",
"id": "223ab70e",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1456,7 +1459,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "df2c9d2a",
"id": "843a027b",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1514,7 +1517,7 @@
},
{
"cell_type": "markdown",
"id": "86e2cffb",
"id": "11c7afd8",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1560,8 +1563,9 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d11920a5",
"id": "464575ff",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
"grade": false,
"grade_id": "integration-demo",
@@ -1627,12 +1631,12 @@
"\n",
" return model\n",
"\n",
"demonstrate_transformer_integration()"
"# demonstrate_transformer_integration() # Moved to __main__ block below"
]
},
{
"cell_type": "markdown",
"id": "777c88af",
"id": "65e903ac",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1717,7 +1721,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "66957b87",
"id": "065a32d8",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1774,7 +1778,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "9badfce3",
"id": "8ff38096",
"metadata": {
"nbgrader": {
"grade": false,
@@ -1819,7 +1823,7 @@
},
{
"cell_type": "markdown",
"id": "1701db55",
"id": "01719014",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1833,8 +1837,9 @@
{
"cell_type": "code",
"execution_count": null,
"id": "93ecb08c",
"id": "d18c01d8",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
"grade": true,
"grade_id": "test-module",
@@ -1907,25 +1912,26 @@
" print(\"Run: tito module complete 13\")\n",
"\n",
"# Call the comprehensive test\n",
"test_module()"
"# test_module() # Only run in __main__ block below"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75f8b229",
"id": "009d2ab0",
"metadata": {},
"outputs": [],
"source": [
"if __name__ == \"__main__\":\n",
" print(\"🚀 Running Transformers module...\")\n",
" demonstrate_transformer_integration()\n",
" test_module()\n",
" print(\"✅ Module validation complete!\")"
]
},
{
"cell_type": "markdown",
"id": "5245c0f2",
"id": "28ae7326",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -1965,7 +1971,7 @@
},
{
"cell_type": "markdown",
"id": "ed29968a",
"id": "eaa4c950",
"metadata": {
"cell_marker": "\"\"\""
},

View File

@@ -430,12 +430,15 @@ class LayerNorm:
mean = x.mean(axis=-1, keepdims=True)
# Compute variance: E[(x - μ)²]
diff = Tensor(x.data - mean.data)
variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))
# Use Tensor operations to preserve computation graph!
diff = x - mean
variance = (diff * diff).mean(axis=-1, keepdims=True)
# Normalize
std = Tensor(np.sqrt(variance.data + self.eps))
normalized = Tensor((x.data - mean.data) / std.data)
# Normalize - use Tensor operations to preserve gradients!
# Add eps as a Tensor for proper gradient flow
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
normalized = (x - mean) / std
# Apply learnable transformation
output = normalized * self.gamma + self.beta
@@ -1414,7 +1417,7 @@ def demonstrate_transformer_integration():
return model
demonstrate_transformer_integration()
# demonstrate_transformer_integration() # Moved to __main__ block below
# %% [markdown]
"""
@@ -1641,11 +1644,12 @@ def test_module():
print("Run: tito module complete 13")
# Call the comprehensive test
test_module()
# test_module() # Only run in __main__ block below
# %%
if __name__ == "__main__":
print("🚀 Running Transformers module...")
demonstrate_transformer_integration()
test_module()
print("✅ Module validation complete!")

View File

@@ -86,12 +86,15 @@ class LayerNorm:
mean = x.mean(axis=-1, keepdims=True)
# Compute variance: E[(x - μ)²]
diff = Tensor(x.data - mean.data)
variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))
# Use Tensor operations to preserve computation graph!
diff = x - mean
variance = (diff * diff).mean(axis=-1, keepdims=True)
# Normalize
std = Tensor(np.sqrt(variance.data + self.eps))
normalized = Tensor((x.data - mean.data) / std.data)
# Normalize - use Tensor operations to preserve gradients!
# Add eps as a Tensor for proper gradient flow
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
normalized = (x - mean) / std
# Apply learnable transformation
output = normalized * self.gamma + self.beta