diff --git a/tests/03_layers/test_progressive_integration.py b/tests/03_layers/test_progressive_integration.py index 1211c3bc..6f442dae 100644 --- a/tests/03_layers/test_progressive_integration.py +++ b/tests/03_layers/test_progressive_integration.py @@ -108,7 +108,7 @@ class TestPriorModulesStillWork: šŸ” IMPORT ERROR: {str(e)} šŸ”§ HOW TO FIX: - 1. Implement Tensor class in modules/source/02_tensor/ + 1. Implement Tensor class in modules/02_tensor/ 2. Export module: tito module complete 02_tensor 3. Check tinytorch.core.tensor exists 4. Verify Tensor class is exported correctly @@ -172,7 +172,7 @@ class TestPriorModulesStillWork: šŸ” IMPORT ERROR: {str(e)} šŸ”§ HOW TO FIX: - 1. Implement ReLU and Sigmoid in modules/source/03_activations/ + 1. Implement ReLU and Sigmoid in modules/03_activations/ 2. Export module: tito module complete 03_activations 3. Check tinytorch.core.activations exists 4. Verify activation classes are exported @@ -240,7 +240,7 @@ class TestModule04LayersCore: šŸ”§ HOW TO IMPLEMENT: - 1. Create in modules/source/04_layers/04_layers_dev.py: + 1. Create in modules/04_layers/04_layers.py: class Layer: '''Base class for all neural network layers.''' diff --git a/tests/06_optimizers/test_progressive_integration.py b/tests/06_optimizers/test_progressive_integration.py index b11a89ed..02e30d00 100644 --- a/tests/06_optimizers/test_progressive_integration.py +++ b/tests/06_optimizers/test_progressive_integration.py @@ -240,7 +240,7 @@ class TestModule06SpatialCore: šŸ”§ HOW TO IMPLEMENT: - 1. Create in modules/source/06_spatial/06_spatial_dev.py: + 1. 
Create in modules/06_spatial/06_spatial.py: from tinytorch.core.layers import Layer from tinytorch.core.tensor import Tensor diff --git a/tests/08_dataloader/test_progressive_integration.py b/tests/08_dataloader/test_progressive_integration.py index 9a13d667..d2580ce2 100644 --- a/tests/08_dataloader/test_progressive_integration.py +++ b/tests/08_dataloader/test_progressive_integration.py @@ -301,7 +301,7 @@ class TestModule09AutogradCore: šŸ”§ HOW TO IMPLEMENT: - 1. Create in modules/source/09_autograd/09_autograd_dev.py: + 1. Create in modules/09_autograd/09_autograd.py: from tinytorch.core.tensor import Tensor diff --git a/tests/13_transformers/test_progressive_integration.py b/tests/13_transformers/test_progressive_integration.py index 24fb8738..8c1cc3bc 100644 --- a/tests/13_transformers/test_progressive_integration.py +++ b/tests/13_transformers/test_progressive_integration.py @@ -306,7 +306,7 @@ class TestModule14BenchmarkingCore: šŸ”§ HOW TO IMPLEMENT: - 1. Create in modules/source/14_benchmarking/14_benchmarking_dev.py: + 1. Create in modules/14_benchmarking/14_benchmarking.py: import time import numpy as np diff --git a/tests/14_profiling/test_progressive_integration.py b/tests/14_profiling/test_progressive_integration.py index 2879bb23..cb7afeff 100644 --- a/tests/14_profiling/test_progressive_integration.py +++ b/tests/14_profiling/test_progressive_integration.py @@ -356,7 +356,7 @@ class TestModule15MLOpsCore: šŸ”§ HOW TO IMPLEMENT: - 1. Create in modules/source/15_mlops/15_mlops_dev.py: + 1. 
Create in modules/15_mlops/15_mlops.py: import time import numpy as np diff --git a/tests/README.md b/tests/README.md index d15e1ee3..447878b7 100644 --- a/tests/README.md +++ b/tests/README.md @@ -85,7 +85,7 @@ When adding a test, ask: - `integration/test_gradient_flow.py` - If this fails, training is broken šŸ“š **Module validation**: -- Each module's inline tests (in `modules/source/`) +- Each module's inline tests (in `modules/`) - Module-specific tests in `tests/XX_modulename/` ## Test Coverage Goals diff --git a/tests/TEST_STRATEGY.md b/tests/TEST_STRATEGY.md index 1c2e03fb..e0ddae58 100644 --- a/tests/TEST_STRATEGY.md +++ b/tests/TEST_STRATEGY.md @@ -13,7 +13,7 @@ This separation follows ML engineering best practices: validate components in is ## šŸ“‹ Tier 1: Inline Tests (Component Validation) -### **Location**: `modules/source/XX_modulename/*_dev.py` +### **Location**: `modules/XX_modulename/*.py` ### **Purpose**: - Validate individual components work correctly @@ -50,7 +50,7 @@ def test_unit_componentname(): tito test 01_tensor --inline-only # Tests run when you execute the module file -python modules/source/01_tensor/tensor_dev.py +python modules/01_tensor/tensor.py ``` ### **Current Status** (Modules 01-15): @@ -149,7 +149,7 @@ tests/ ```bash # 1. Work on module -cd modules/source/01_tensor +cd modules/01_tensor vim tensor_dev.py # 2.
Run inline tests (fast feedback) diff --git a/tests/checkpoints/test_checkpoint_integration.py b/tests/checkpoints/test_checkpoint_integration.py index 36e8271e..7de1b87c 100644 --- a/tests/checkpoints/test_checkpoint_integration.py +++ b/tests/checkpoints/test_checkpoint_integration.py @@ -56,7 +56,7 @@ class CheckpointValidator: def validate_module_exists(self, module_name: str) -> bool: """Check if a module file exists.""" - module_file = self.module_path / module_name / f"{module_name.split('_')[1]}_dev.py" + module_file = self.module_path / module_name / f"{module_name.split('_')[1]}.py" return module_file.exists() def validate_module_exports(self, module_name: str) -> Tuple[bool, List[str]]: diff --git a/tests/integration/module_complete_orchestrator.py b/tests/integration/module_complete_orchestrator.py index cfc94856..a0087aa7 100644 --- a/tests/integration/module_complete_orchestrator.py +++ b/tests/integration/module_complete_orchestrator.py @@ -112,7 +112,7 @@ class ModuleCompletionOrchestrator: """Export module using nbdev.""" try: # Run nbdev_export for the specific module - cmd = ["nbdev_export", "--path", f"modules/source/{module_name}/{module_name}_dev.py"] + cmd = ["nbdev_export", "--path", f"modules/{module_name}/{module_name}.py"] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: diff --git a/tests/integration/test_optimizers_integration.py b/tests/integration/test_optimizers_integration.py index 8dc0bca9..06078425 100644 --- a/tests/integration/test_optimizers_integration.py +++ b/tests/integration/test_optimizers_integration.py @@ -18,12 +18,12 @@ module_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..' 
sys.path.insert(0, module_path) # Import modules in dependency order -exec(open(os.path.join(module_path, '01_tensor/tensor_dev.py')).read()) -exec(open(os.path.join(module_path, '02_activations/activations_dev.py')).read()) -exec(open(os.path.join(module_path, '03_layers/layers_dev.py')).read()) -exec(open(os.path.join(module_path, '05_autograd/autograd_dev.py')).read()) -exec(open(os.path.join(module_path, '04_losses/losses_dev.py')).read()) -exec(open(os.path.join(module_path, '06_optimizers/optimizers_dev.py')).read()) +exec(open(os.path.join(module_path, '01_tensor/tensor.py')).read()) +exec(open(os.path.join(module_path, '02_activations/activations.py')).read()) +exec(open(os.path.join(module_path, '03_layers/layers.py')).read()) +exec(open(os.path.join(module_path, '05_autograd/autograd.py')).read()) +exec(open(os.path.join(module_path, '04_losses/losses.py')).read()) +exec(open(os.path.join(module_path, '06_optimizers/optimizers.py')).read()) def test_sgd_with_linear_layer(): """Test SGD optimizer with Linear layer and autograd.""" diff --git a/tests/regression/test_gradient_flow_fixes.py b/tests/regression/test_gradient_flow_fixes.py index cd114739..d8f71ef2 100644 --- a/tests/regression/test_gradient_flow_fixes.py +++ b/tests/regression/test_gradient_flow_fixes.py @@ -34,7 +34,7 @@ def test_regression_batched_matmul(): Regression test for Issue #1: np.dot doesn't handle batched 3D matmul. Bug: Using np.dot for 3D tensors produces wrong shapes. - Fix: Changed to np.matmul in modules/source/01_tensor/tensor_dev.py + Fix: Changed to np.matmul in modules/01_tensor/tensor.py Commit: Module 01 fixes """ print("Testing regression: batched 3D matmul...") @@ -59,7 +59,7 @@ def test_regression_transpose_requires_grad(): Regression test for Issue #2: transpose() not preserving requires_grad. Bug: x.transpose() created Tensor without requires_grad. 
- Fix: Added requires_grad parameter in modules/source/01_tensor/tensor_dev.py + Fix: Added requires_grad parameter in modules/01_tensor/tensor.py Commit: Module 01 fixes """ print("Testing regression: transpose requires_grad...") diff --git a/tests/test_gradient_flow.py b/tests/test_gradient_flow.py new file mode 100644 index 00000000..1e66f55a --- /dev/null +++ b/tests/test_gradient_flow.py @@ -0,0 +1,436 @@ +#!/usr/bin/env python3 +""" +Comprehensive Gradient Flow Tests for TinyTorch +================================================ + +Tests that gradients flow correctly through: +1. Simple networks (single layer) +2. Multi-layer networks (MLP) +3. Convolutional networks (CNN) +4. Attention mechanisms +5. Complete training loops + +This ensures backpropagation works correctly end-to-end. +""" + +import sys +import os +import numpy as np + +# Add project root to path +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, project_root) + +from tinytorch.core.tensor import Tensor +from tinytorch.core.layers import Linear, Dropout +from tinytorch.core.activations import ReLU, Sigmoid, Softmax +from tinytorch.core.losses import MSELoss, BinaryCrossEntropyLoss, CrossEntropyLoss +from tinytorch.core.optimizers import SGD, Adam +from tinytorch.core.spatial import Conv2d, MaxPool2d +from tinytorch.core.autograd import enable_autograd + +# Enable autograd +enable_autograd() + +def test_simple_linear_gradient_flow(): + """Test gradients flow through a single linear layer""" + print("\n" + "="*70) + print("TEST 1: Simple Linear Layer Gradient Flow") + print("="*70) + + # Create simple network: Linear(2->1) + layer = Linear(2, 1) + + # Input + x = Tensor([[1.0, 2.0]], requires_grad=True) + target = Tensor([[3.0]]) + + # Forward pass + output = layer.forward(x) + + # Loss + loss_fn = MSELoss() + loss = loss_fn.forward(output, target) + + print(f"Initial loss: {float(loss.data):.4f}") + print(f"Initial weight shape: 
{layer.weight.shape}") + print(f"Initial bias shape: {layer.bias.shape}") + + # Backward pass + loss.backward() + + # Check gradients exist + assert layer.weight.grad is not None, "Weight gradient is None!" + assert layer.bias.grad is not None, "Bias gradient is None!" + assert x.grad is not None, "Input gradient is None!" + + # Check gradients are non-zero + weight_grad_norm = np.linalg.norm(layer.weight.grad.data) + bias_grad_norm = np.linalg.norm(layer.bias.grad.data) + input_grad_norm = np.linalg.norm(x.grad.data) + + print(f"\nāœ“ Weight gradient norm: {weight_grad_norm:.6f}") + print(f"āœ“ Bias gradient norm: {bias_grad_norm:.6f}") + print(f"āœ“ Input gradient norm: {input_grad_norm:.6f}") + + assert weight_grad_norm > 1e-6, f"Weight gradients too small: {weight_grad_norm}" + assert bias_grad_norm > 1e-6, f"Bias gradients too small: {bias_grad_norm}" + assert input_grad_norm > 1e-6, f"Input gradients too small: {input_grad_norm}" + + print("\nāœ… TEST PASSED: Gradients flow correctly through linear layer") + return True + + +def test_mlp_gradient_flow(): + """Test gradients flow through multi-layer perceptron""" + print("\n" + "="*70) + print("TEST 2: Multi-Layer Perceptron Gradient Flow") + print("="*70) + + # Create MLP: Input(4) -> Linear(4->8) -> ReLU -> Linear(8->2) + layer1 = Linear(4, 8) + activation = ReLU() + layer2 = Linear(8, 2) + + # Input and target + x = Tensor(np.random.randn(3, 4), requires_grad=True) + target = Tensor(np.array([[1, 0], [0, 1], [1, 0]])) + + print(f"Input shape: {x.shape}") + print(f"Target shape: {target.shape}") + + # Forward pass + h1 = layer1.forward(x) + h1_activated = activation.forward(h1) + output = layer2.forward(h1_activated) + + print(f"Hidden layer shape: {h1.shape}") + print(f"Output shape: {output.shape}") + + # Loss + loss_fn = MSELoss() + loss = loss_fn.forward(output, target) + + print(f"Initial loss: {float(loss.data):.4f}") + + # Backward pass + loss.backward() + + # Check all layer gradients exist + assert 
layer1.weight.grad is not None, "Layer1 weight gradient is None!" + assert layer1.bias.grad is not None, "Layer1 bias gradient is None!" + assert layer2.weight.grad is not None, "Layer2 weight gradient is None!" + assert layer2.bias.grad is not None, "Layer2 bias gradient is None!" + + # Check gradient magnitudes + l1_weight_norm = np.linalg.norm(layer1.weight.grad.data) + l1_bias_norm = np.linalg.norm(layer1.bias.grad.data) + l2_weight_norm = np.linalg.norm(layer2.weight.grad.data) + l2_bias_norm = np.linalg.norm(layer2.bias.grad.data) + + print(f"\nāœ“ Layer1 weight gradient norm: {l1_weight_norm:.6f}") + print(f"āœ“ Layer1 bias gradient norm: {l1_bias_norm:.6f}") + print(f"āœ“ Layer2 weight gradient norm: {l2_weight_norm:.6f}") + print(f"āœ“ Layer2 bias gradient norm: {l2_bias_norm:.6f}") + + assert l1_weight_norm > 1e-6, "Layer1 weight gradients too small" + assert l1_bias_norm > 1e-6, "Layer1 bias gradients too small" + assert l2_weight_norm > 1e-6, "Layer2 weight gradients too small" + assert l2_bias_norm > 1e-6, "Layer2 bias gradients too small" + + print("\nāœ… TEST PASSED: Gradients flow correctly through MLP") + return True + + +def test_mlp_training_updates(): + """Test that MLP actually learns (loss decreases)""" + print("\n" + "="*70) + print("TEST 3: MLP Training - Loss Reduction") + print("="*70) + + # Create simple MLP + layer1 = Linear(2, 4) + activation = ReLU() + layer2 = Linear(4, 1) + + # Simple dataset (XOR-like) + X = Tensor(np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]), requires_grad=False) + y = Tensor(np.array([[0.0], [1.0], [1.0], [0.0]])) + + # Optimizer + optimizer = SGD([layer1.weight, layer1.bias, layer2.weight, layer2.bias], lr=0.1) + loss_fn = MSELoss() + + losses = [] + + print("Training for 50 epochs...") + for epoch in range(50): + # Forward + h1 = layer1.forward(X) + h1_act = activation.forward(h1) + output = layer2.forward(h1_act) + + # Loss + loss = loss_fn.forward(output, y) + losses.append(float(loss.data)) + + 
# Backward + optimizer.zero_grad() + loss.backward() + + # Update + optimizer.step() + + if (epoch + 1) % 10 == 0: + print(f"Epoch {epoch+1:2d}: Loss = {float(loss.data):.6f}") + + # Check loss decreased + initial_loss = losses[0] + final_loss = losses[-1] + reduction = initial_loss - final_loss + reduction_pct = (reduction / initial_loss) * 100 + + print(f"\nāœ“ Initial loss: {initial_loss:.6f}") + print(f"āœ“ Final loss: {final_loss:.6f}") + print(f"āœ“ Reduction: {reduction:.6f} ({reduction_pct:.1f}%)") + + assert final_loss < initial_loss, f"Loss didn't decrease! Initial: {initial_loss}, Final: {final_loss}" + assert reduction_pct > 10, f"Loss reduction too small: {reduction_pct:.1f}%" + + print("\nāœ… TEST PASSED: MLP learns successfully (loss decreases)") + return True + + +def test_cnn_gradient_flow(): + """Test gradients flow through convolutional layers""" + print("\n" + "="*70) + print("TEST 4: CNN Gradient Flow") + print("="*70) + + # Create simple CNN: Conv2d -> ReLU -> Linear + conv = Conv2d(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=0) + activation = ReLU() + + # Input: batch=2, channels=1, height=8, width=8 + x = Tensor(np.random.randn(2, 1, 8, 8), requires_grad=True) + + print(f"Input shape: {x.shape}") + print(f"Conv weight shape: {conv.weight.shape}") + + # Forward through conv + conv_out = conv.forward(x) + print(f"Conv output shape: {conv_out.shape}") + + activated = activation.forward(conv_out) + + # Flatten for linear layer + batch_size = activated.shape[0] + flattened_size = np.prod(activated.shape[1:]) + # Use reshape method to maintain gradient flow + flattened = activated.reshape(batch_size, flattened_size) + + linear = Linear(flattened_size, 2) + output = linear.forward(flattened) + + print(f"Flattened shape: {flattened.shape}") + print(f"Output shape: {output.shape}") + + # Loss + target = Tensor(np.array([[1, 0], [0, 1]])) + loss_fn = MSELoss() + loss = loss_fn.forward(output, target) + + print(f"Initial loss: 
{float(loss.data):.4f}") + + # Backward + loss.backward() + + # Check gradients + assert conv.weight.grad is not None, "Conv weight gradient is None!" + assert conv.bias.grad is not None, "Conv bias gradient is None!" + assert linear.weight.grad is not None, "Linear weight gradient is None!" + + weight_grad_norm = np.linalg.norm(conv.weight.grad.data) + conv_bias_norm = np.linalg.norm(conv.bias.grad.data) + linear_grad_norm = np.linalg.norm(linear.weight.grad.data) + + print(f"\nāœ“ Conv weight gradient norm: {weight_grad_norm:.6f}") + print(f"āœ“ Conv bias gradient norm: {conv_bias_norm:.6f}") + print(f"āœ“ Linear weight gradient norm: {linear_grad_norm:.6f}") + + assert weight_grad_norm > 1e-6, f"Conv weight gradients too small: {weight_grad_norm}" + assert conv_bias_norm > 1e-6, f"Conv bias gradients too small: {conv_bias_norm}" + assert linear_grad_norm > 1e-6, f"Linear gradients too small: {linear_grad_norm}" + + print("\nāœ… TEST PASSED: Gradients flow correctly through CNN") + return True + + +def test_cnn_training_updates(): + """Test that CNN actually learns on simple data""" + print("\n" + "="*70) + print("TEST 5: CNN Training - Loss Reduction") + print("="*70) + + # Simple CNN + conv = Conv2d(1, 2, kernel_size=3, stride=1, padding=1) + activation = ReLU() + + # Simple data: 4 samples, 1 channel, 4x4 images + X = Tensor(np.random.randn(4, 1, 4, 4), requires_grad=False) + + # After conv: (4, 2, 4, 4) -> flatten to (4, 32) + conv_out_size = 2 * 4 * 4 # channels * height * width + linear = Linear(conv_out_size, 2) + + y = Tensor(np.array([[1, 0], [0, 1], [1, 0], [0, 1]])) + + # Get parameters with gradients + params = [] + for p in [conv.weight, conv.bias, linear.weight, linear.bias]: + if not p.requires_grad: + p.requires_grad = True + params.append(p) + + # Optimizer + optimizer = SGD(params, lr=0.01) + loss_fn = MSELoss() + + losses = [] + + print("Training for 30 epochs...") + for epoch in range(30): + # Forward + conv_out = conv.forward(X) + activated = 
activation.forward(conv_out) + + # Flatten using reshape to maintain gradients + batch_size = activated.shape[0] + flattened = activated.reshape(batch_size, -1) + + output = linear.forward(flattened) + + # Loss + loss = loss_fn.forward(output, y) + losses.append(float(loss.data)) + + # Backward + optimizer.zero_grad() + loss.backward() + + # Update + optimizer.step() + + if (epoch + 1) % 10 == 0: + print(f"Epoch {epoch+1:2d}: Loss = {float(loss.data):.6f}") + + # Check loss decreased + initial_loss = losses[0] + final_loss = losses[-1] + reduction = initial_loss - final_loss + reduction_pct = (reduction / initial_loss) * 100 + + print(f"\nāœ“ Initial loss: {initial_loss:.6f}") + print(f"āœ“ Final loss: {final_loss:.6f}") + print(f"āœ“ Reduction: {reduction:.6f} ({reduction_pct:.1f}%)") + + assert final_loss < initial_loss, f"Loss didn't decrease! Initial: {initial_loss}, Final: {final_loss}" + + print("\nāœ… TEST PASSED: CNN learns successfully (loss decreases)") + return True + + +def test_gradient_accumulation(): + """Test that gradients accumulate correctly across batches""" + print("\n" + "="*70) + print("TEST 6: Gradient Accumulation") + print("="*70) + + layer = Linear(2, 1) + + # Two batches + x1 = Tensor([[1.0, 2.0]], requires_grad=True) + x2 = Tensor([[3.0, 4.0]], requires_grad=True) + target = Tensor([[1.0]]) + + loss_fn = MSELoss() + + # Forward + backward on first batch (don't zero grad) + out1 = layer.forward(x1) + loss1 = loss_fn.forward(out1, target) + loss1.backward() + + grad_after_first = np.array(layer.weight.grad.data) + + # Forward + backward on second batch (gradients should accumulate) + out2 = layer.forward(x2) + loss2 = loss_fn.forward(out2, target) + loss2.backward() + + grad_after_second = layer.weight.grad.data + + # Gradients should have accumulated (not been replaced) + grad_diff = np.linalg.norm(grad_after_second - grad_after_first) + + print(f"āœ“ Gradient after first batch norm: {np.linalg.norm(grad_after_first):.6f}") + print(f"āœ“ 
Gradient after second batch norm: {np.linalg.norm(grad_after_second):.6f}") + print(f"āœ“ Difference: {grad_diff:.6f}") + + assert grad_diff > 1e-6, "Gradients didn't accumulate properly" + + print("\nāœ… TEST PASSED: Gradients accumulate correctly") + return True + + +def main(): + """Run all gradient flow tests""" + print("\n" + "="*70) + print(" TINYTORCH GRADIENT FLOW TEST SUITE") + print("="*70) + + tests = [ + ("Simple Linear", test_simple_linear_gradient_flow), + ("MLP Gradient Flow", test_mlp_gradient_flow), + ("MLP Training", test_mlp_training_updates), + ("CNN Gradient Flow", test_cnn_gradient_flow), + ("CNN Training", test_cnn_training_updates), + ("Gradient Accumulation", test_gradient_accumulation), + ] + + results = [] + + for name, test_func in tests: + try: + result = test_func() + results.append((name, "PASSED" if result else "FAILED")) + except Exception as e: + print(f"\nāŒ TEST FAILED: {name}") + print(f"Error: {str(e)}") + import traceback + traceback.print_exc() + results.append((name, "FAILED")) + + # Summary + print("\n" + "="*70) + print(" TEST SUMMARY") + print("="*70) + + passed = sum(1 for _, status in results if status == "PASSED") + total = len(results) + + for name, status in results: + symbol = "āœ…" if status == "PASSED" else "āŒ" + print(f"{symbol} {name}: {status}") + + print(f"\nTotal: {passed}/{total} tests passed") + + if passed == total: + print("\nšŸŽ‰ ALL TESTS PASSED! Gradients flow correctly through TinyTorch.") + return 0 + else: + print(f"\nāš ļø {total - passed} tests failed. Please review the errors above.") + return 1 + + +if __name__ == "__main__": + exit(main())