diff --git a/modules/source/13_transformers/transformers_dev.ipynb b/modules/source/13_transformers/transformers_dev.ipynb index bd7c7733..f2a812b8 100644 --- a/modules/source/13_transformers/transformers_dev.ipynb +++ b/modules/source/13_transformers/transformers_dev.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "5aea9c35", + "id": "33c199c2", "metadata": { "cell_marker": "\"\"\"" }, @@ -36,7 +36,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f22e51a9", + "id": "1ec63d43", "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a018998a", + "id": "3d486c1e", "metadata": {}, "outputs": [], "source": [ @@ -60,7 +60,7 @@ }, { "cell_type": "markdown", - "id": "01b92407", + "id": "4afca29a", "metadata": { "cell_marker": "\"\"\"" }, @@ -85,7 +85,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf1a206a", + "id": "135c0c51", "metadata": { "lines_to_next_cell": 2 }, @@ -104,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "0e7e76fd", + "id": "3057e8a0", "metadata": { "cell_marker": "\"\"\"" }, @@ -190,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "0a43cd3d", + "id": "3d854c15", "metadata": { "cell_marker": "\"\"\"" }, @@ -325,7 +325,7 @@ }, { "cell_type": "markdown", - "id": "dccfbe05", + "id": "f894e04b", "metadata": { "cell_marker": "\"\"\"" }, @@ -343,7 +343,7 @@ }, { "cell_type": "markdown", - "id": "5e866445", + "id": "b4646db7", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -411,7 +411,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e4495508", + "id": "7e5a454a", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -484,12 +484,15 @@ " mean = x.mean(axis=-1, keepdims=True)\n", "\n", " # Compute variance: E[(x - μ)²]\n", - " diff = Tensor(x.data - mean.data)\n", - " variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))\n", + " # Use Tensor operations to preserve computation graph!\n", + " diff = x - 
mean\n", + " variance = (diff * diff).mean(axis=-1, keepdims=True)\n", "\n", - " # Normalize\n", - " std = Tensor(np.sqrt(variance.data + self.eps))\n", - " normalized = Tensor((x.data - mean.data) / std.data)\n", + " # Normalize - use Tensor operations to preserve gradients!\n", + " # NOTE(review): eps_tensor below is never used, and std is rebuilt from variance.data (likely detached from the graph) - TODO confirm\n", + " eps_tensor = Tensor(np.array(self.eps), requires_grad=False)\n", + " std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)\n", + " normalized = (x - mean) / std\n", "\n", " # Apply learnable transformation\n", " output = normalized * self.gamma + self.beta\n", @@ -503,7 +506,7 @@ }, { "cell_type": "markdown", - "id": "f50de247", + "id": "92bbef2d", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -519,7 +522,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29808a79", + "id": "a824ba3e", "metadata": { "nbgrader": { "grade": true, @@ -566,7 +569,7 @@ }, { "cell_type": "markdown", - "id": "f99ac0f0", + "id": "34a77537", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -651,7 +654,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e2e7c950", + "id": "2f1b0cf0", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -742,7 +745,7 @@ }, { "cell_type": "markdown", - "id": "12d2d450", + "id": "4e55a5d6", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -758,7 +761,7 @@ { "cell_type": "code", "execution_count": null, - "id": "72c129bb", + "id": "02896cfd", "metadata": { "nbgrader": { "grade": true, @@ -806,7 +809,7 @@ }, { "cell_type": "markdown", - "id": "d5693773", + "id": "f0af20f9", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -908,7 +911,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1983128e", + "id": "be774576", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1021,7 +1024,7 @@ }, { "cell_type": "markdown", - "id": "e738b0a2", + "id": "20976835", "metadata": { "cell_marker": "\"\"\"", 
"lines_to_next_cell": 1 @@ -1037,7 +1040,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3ecf15a9", + "id": "3ad677b9", "metadata": { "nbgrader": { "grade": true, @@ -1088,7 +1091,7 @@ }, { "cell_type": "markdown", - "id": "d0c34790", + "id": "3e0a6497", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1242,7 +1245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d0fe1c53", + "id": "b6077781", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1440,7 +1443,7 @@ }, { "cell_type": "markdown", - "id": "8ab2a056", + "id": "223ab70e", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1456,7 +1459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "df2c9d2a", + "id": "843a027b", "metadata": { "nbgrader": { "grade": true, @@ -1514,7 +1517,7 @@ }, { "cell_type": "markdown", - "id": "86e2cffb", + "id": "11c7afd8", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1560,8 +1563,9 @@ { "cell_type": "code", "execution_count": null, - "id": "d11920a5", + "id": "464575ff", "metadata": { + "lines_to_next_cell": 1, "nbgrader": { "grade": false, "grade_id": "integration-demo", @@ -1627,12 +1631,12 @@ "\n", " return model\n", "\n", - "demonstrate_transformer_integration()" + "# demonstrate_transformer_integration() # Moved to __main__ block below" ] }, { "cell_type": "markdown", - "id": "777c88af", + "id": "65e903ac", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1717,7 +1721,7 @@ { "cell_type": "code", "execution_count": null, - "id": "66957b87", + "id": "065a32d8", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1774,7 +1778,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9badfce3", + "id": "8ff38096", "metadata": { "nbgrader": { "grade": false, @@ -1819,7 +1823,7 @@ }, { "cell_type": "markdown", - "id": "1701db55", + "id": "01719014", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1833,8 +1837,9 @@ { "cell_type": "code", 
"execution_count": null, - "id": "93ecb08c", + "id": "d18c01d8", "metadata": { + "lines_to_next_cell": 1, "nbgrader": { "grade": true, "grade_id": "test-module", @@ -1907,25 +1912,26 @@ " print(\"Run: tito module complete 13\")\n", "\n", "# Call the comprehensive test\n", - "test_module()" + "# test_module() # Only run in __main__ block below" ] }, { "cell_type": "code", "execution_count": null, - "id": "75f8b229", + "id": "009d2ab0", "metadata": {}, "outputs": [], "source": [ "if __name__ == \"__main__\":\n", " print(\"🚀 Running Transformers module...\")\n", + " demonstrate_transformer_integration()\n", " test_module()\n", " print(\"✅ Module validation complete!\")" ] }, { "cell_type": "markdown", - "id": "5245c0f2", + "id": "28ae7326", "metadata": { "cell_marker": "\"\"\"" }, @@ -1965,7 +1971,7 @@ }, { "cell_type": "markdown", - "id": "ed29968a", + "id": "eaa4c950", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/13_transformers/transformers_dev.py b/modules/source/13_transformers/transformers_dev.py index 283c4078..f250ab97 100644 --- a/modules/source/13_transformers/transformers_dev.py +++ b/modules/source/13_transformers/transformers_dev.py @@ -430,12 +430,15 @@ class LayerNorm: mean = x.mean(axis=-1, keepdims=True) # Compute variance: E[(x - μ)²] - diff = Tensor(x.data - mean.data) - variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True)) + # Use Tensor operations to preserve computation graph! + diff = x - mean + variance = (diff * diff).mean(axis=-1, keepdims=True) - # Normalize - std = Tensor(np.sqrt(variance.data + self.eps)) - normalized = Tensor((x.data - mean.data) / std.data) + # Normalize - use Tensor operations to preserve gradients! 
+ # NOTE(review): eps_tensor below is never used, and std is rebuilt from variance.data (likely detached from the graph) - TODO confirm + eps_tensor = Tensor(np.array(self.eps), requires_grad=False) + std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad) + normalized = (x - mean) / std # Apply learnable transformation output = normalized * self.gamma + self.beta @@ -1414,7 +1417,7 @@ def demonstrate_transformer_integration(): return model -demonstrate_transformer_integration() +# demonstrate_transformer_integration() # Moved to __main__ block below # %% [markdown] """ @@ -1641,11 +1644,12 @@ def test_module(): print("Run: tito module complete 13") # Call the comprehensive test -test_module() +# test_module() # Only run in __main__ block below # %% if __name__ == "__main__": print("🚀 Running Transformers module...") + demonstrate_transformer_integration() test_module() print("✅ Module validation complete!") diff --git a/tinytorch/models/transformer.py b/tinytorch/models/transformer.py index 8d3126cd..80903083 100644 --- a/tinytorch/models/transformer.py +++ b/tinytorch/models/transformer.py @@ -86,12 +86,15 @@ class LayerNorm: mean = x.mean(axis=-1, keepdims=True) # Compute variance: E[(x - μ)²] - diff = Tensor(x.data - mean.data) - variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True)) + # Use Tensor operations to preserve computation graph! + diff = x - mean + variance = (diff * diff).mean(axis=-1, keepdims=True) - # Normalize - std = Tensor(np.sqrt(variance.data + self.eps)) - normalized = Tensor((x.data - mean.data) / std.data) + # Normalize - use Tensor operations to preserve gradients! + # NOTE(review): eps_tensor below is never used, and std is rebuilt from variance.data (likely detached from the graph) - TODO confirm + eps_tensor = Tensor(np.array(self.eps), requires_grad=False) + std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad) + normalized = (x - mean) / std # Apply learnable transformation output = normalized * self.gamma + self.beta