diff --git a/modules/source/08_optimizers/module.yaml b/modules/source/08_optimizers/module.yaml
index d4b6ee8a..436e0d0d 100644
--- a/modules/source/08_optimizers/module.yaml
+++ b/modules/source/08_optimizers/module.yaml
@@ -42,7 +42,7 @@ exports:
 files:
   main: "optimizers_dev.py"
   readme: "README.md"
-  tests: "tests/test_optimizers.py"
+  # tests: no separate test file; the tests are inline in optimizers_dev.py
 
 # Assessment configuration
 assessment:
diff --git a/modules/source/08_optimizers/tests/test_optimizers.py b/modules/source/08_optimizers/tests/test_optimizers.py
deleted file mode 100644
index 67c0de8a..00000000
--- a/modules/source/08_optimizers/tests/test_optimizers.py
+++ /dev/null
@@ -1,379 +0,0 @@
-"""
-Tests for the optimizers module.
-
-This module tests the optimization algorithms implemented in the optimizers module:
-- Gradient descent step function
-- SGD optimizer with momentum
-- Adam optimizer with adaptive learning rates
-- Learning rate scheduling
-- Training integration
-"""
-
-import sys
-import os
-import numpy as np
-import pytest
-from pathlib import Path
-
-# Add the module to the path
-module_path = Path(__file__).parent.parent
-sys.path.insert(0, str(module_path))
-
-# Import from the optimizers module
-from optimizers_dev import (
-    gradient_descent_step,
-    SGD,
-    Adam,
-    StepLR,
-    train_simple_model
-)
-
-# Import dependencies
-sys.path.append(str(module_path / ".." / "01_tensor"))
-from tensor_dev import Tensor
-sys.path.append(str(module_path / ".." / "07_autograd"))
-from autograd_dev import Variable
-
-
-class TestGradientDescentStep:
-    """Test basic gradient descent step function."""
-    
-    def test_basic_parameter_update(self):
-        """Test basic parameter update with gradient descent."""
-        # Create parameter with gradient
-        w = Variable(2.0, requires_grad=True)
-        w.grad = Variable(0.5)
-        
-        # Apply gradient descent step
-        gradient_descent_step(w, learning_rate=0.1)
-        
-        # Check parameter was updated correctly
-        expected_value = 2.0 - 0.1 * 0.5  # 1.95
-        assert abs(w.data.data.item() - expected_value) < 1e-6
-    
-    def test_negative_gradient(self):
-        """Test parameter update with negative gradient."""
-        w = Variable(1.0, requires_grad=True)
-        w.grad = Variable(-0.2)
-        
-        gradient_descent_step(w, learning_rate=0.1)
-        
-        expected_value = 1.0 - 0.1 * (-0.2)  # 1.02
-        assert abs(w.data.data.item() - expected_value) < 1e-6
-    
-    def test_no_gradient(self):
-        """Test that parameter doesn't update when there's no gradient."""
-        w = Variable(3.0, requires_grad=True)
-        w.grad = None
-        original_value = w.data.data.item()
-        
-        gradient_descent_step(w, learning_rate=0.1)
-        
-        assert w.data.data.item() == original_value
-    
-    def test_zero_learning_rate(self):
-        """Test parameter doesn't update with zero learning rate."""
-        w = Variable(2.0, requires_grad=True)
-        w.grad = Variable(0.5)
-        original_value = w.data.data.item()
-        
-        gradient_descent_step(w, learning_rate=0.0)
-        
-        assert w.data.data.item() == original_value
-
-
-class TestSGDOptimizer:
-    """Test SGD optimizer with momentum."""
-    
-    def test_sgd_initialization(self):
-        """Test SGD optimizer initialization."""
-        w1 = Variable(1.0, requires_grad=True)
-        w2 = Variable(2.0, requires_grad=True)
-        
-        optimizer = SGD([w1, w2], learning_rate=0.01, momentum=0.9)
-        
-        assert optimizer.learning_rate == 0.01
-        assert optimizer.momentum == 0.9
-        assert optimizer.step_count == 0
-        assert len(optimizer.momentum_buffers) == 0
-    
-    def test_sgd_zero_grad(self):
-        """Test gradient zeroing functionality."""
-        w1 = Variable(1.0, requires_grad=True)
-        w2 = Variable(2.0, requires_grad=True)
-        
-        # Set some gradients
-        w1.grad = Variable(0.1)
-        w2.grad = Variable(0.2)
-        
-        optimizer = SGD([w1, w2], learning_rate=0.01)
-        optimizer.zero_grad()
-        
-        assert w1.grad is None
-        assert w2.grad is None
-    
-    def test_sgd_step_no_momentum(self):
-        """Test SGD step without momentum."""
-        w = Variable(1.0, requires_grad=True)
-        w.grad = Variable(0.1)
-        
-        optimizer = SGD([w], learning_rate=0.1, momentum=0.0)
-        optimizer.step()
-        
-        expected_value = 1.0 - 0.1 * 0.1  # 0.99
-        assert abs(w.data.data.item() - expected_value) < 1e-6
-        assert optimizer.step_count == 1
-    
-    def test_sgd_step_with_momentum(self):
-        """Test SGD step with momentum."""
-        w = Variable(1.0, requires_grad=True)
-        
-        optimizer = SGD([w], learning_rate=0.1, momentum=0.9)
-        
-        # First step
-        w.grad = Variable(0.1)
-        optimizer.step()
-        
-        # Second step with same gradient
-        w.grad = Variable(0.1)
-        optimizer.step()
-        
-        # Should have momentum buffers
-        assert len(optimizer.momentum_buffers) == 1
-        assert optimizer.step_count == 2
-    
-    def test_sgd_multiple_parameters(self):
-        """Test SGD with multiple parameters."""
-        w1 = Variable(1.0, requires_grad=True)
-        w2 = Variable(2.0, requires_grad=True)
-        b = Variable(0.5, requires_grad=True)
-        
-        optimizer = SGD([w1, w2, b], learning_rate=0.1, momentum=0.9)
-        
-        # Set gradients
-        w1.grad = Variable(0.1)
-        w2.grad = Variable(0.2)
-        b.grad = Variable(0.05)
-        
-        optimizer.step()
-        
-        assert len(optimizer.momentum_buffers) == 3
-        assert optimizer.step_count == 1
-
-
-class TestAdamOptimizer:
-    """Test Adam optimizer with adaptive learning rates."""
-    
-    def test_adam_initialization(self):
-        """Test Adam optimizer initialization."""
-        w1 = Variable(1.0, requires_grad=True)
-        w2 = Variable(2.0, requires_grad=True)
-        
-        optimizer = Adam([w1, w2], learning_rate=0.001, beta1=0.9, beta2=0.999)
-        
-        assert optimizer.learning_rate == 0.001
-        assert optimizer.beta1 == 0.9
-        assert optimizer.beta2 == 0.999
-        assert optimizer.epsilon == 1e-8
-        assert optimizer.step_count == 0
-        assert len(optimizer.first_moment) == 0
-        assert len(optimizer.second_moment) == 0
-    
-    def test_adam_zero_grad(self):
-        """Test gradient zeroing functionality."""
-        w1 = Variable(1.0, requires_grad=True)
-        w2 = Variable(2.0, requires_grad=True)
-        
-        # Set some gradients
-        w1.grad = Variable(0.1)
-        w2.grad = Variable(0.2)
-        
-        optimizer = Adam([w1, w2], learning_rate=0.001)
-        optimizer.zero_grad()
-        
-        assert w1.grad is None
-        assert w2.grad is None
-    
-    def test_adam_step(self):
-        """Test Adam optimization step."""
-        w = Variable(1.0, requires_grad=True)
-        w.grad = Variable(0.1)
-        
-        optimizer = Adam([w], learning_rate=0.001)
-        original_value = w.data.data.item()
-        
-        optimizer.step()
-        
-        # Parameter should be updated
-        assert w.data.data.item() != original_value
-        assert optimizer.step_count == 1
-        assert len(optimizer.first_moment) == 1
-        assert len(optimizer.second_moment) == 1
-    
-    def test_adam_multiple_steps(self):
-        """Test Adam with multiple optimization steps."""
-        w = Variable(1.0, requires_grad=True)
-        optimizer = Adam([w], learning_rate=0.001)
-        
-        # Run multiple steps
-        for i in range(5):
-            w.grad = Variable(0.1)
-            optimizer.step()
-        
-        assert optimizer.step_count == 5
-        assert len(optimizer.first_moment) == 1
-        assert len(optimizer.second_moment) == 1
-    
-    def test_adam_bias_correction(self):
-        """Test that Adam applies bias correction."""
-        w = Variable(1.0, requires_grad=True)
-        optimizer = Adam([w], learning_rate=0.001, beta1=0.9, beta2=0.999)
-        
-        # First step
-        w.grad = Variable(0.1)
-        optimizer.step()
-        value_after_step1 = w.data.data.item()
-        
-        # Second step with same gradient
-        w.grad = Variable(0.1)
-        optimizer.step()
-        value_after_step2 = w.data.data.item()
-        
-        # Updates should be different due to bias correction
-        step1_update = 1.0 - value_after_step1
-        step2_update = value_after_step1 - value_after_step2
-        
-        # Step sizes should be different (not strictly equal due to bias correction)
-        assert abs(step1_update - step2_update) > 1e-6
-
-
-class TestStepLRScheduler:
-    """Test step learning rate scheduler."""
-    
-    def test_steplr_initialization(self):
-        """Test StepLR scheduler initialization."""
-        w = Variable(1.0, requires_grad=True)
-        optimizer = SGD([w], learning_rate=0.1)
-        scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
-        
-        assert scheduler.step_size == 10
-        assert scheduler.gamma == 0.1
-        assert scheduler.initial_lr == 0.1
-        assert scheduler.step_count == 0
-        assert scheduler.get_lr() == 0.1
-    
-    def test_steplr_no_decay(self):
-        """Test learning rate before decay step."""
-        w = Variable(1.0, requires_grad=True)
-        optimizer = SGD([w], learning_rate=0.1)
-        scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
-        
-        # First few steps should not decay
-        for i in range(3):
-            scheduler.step()
-            assert scheduler.get_lr() == 0.1
-    
-    def test_steplr_first_decay(self):
-        """Test first learning rate decay."""
-        w = Variable(1.0, requires_grad=True)
-        optimizer = SGD([w], learning_rate=0.1)
-        scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
-        
-        # Step 3 times (no decay)
-        for i in range(3):
-            scheduler.step()
-        
-        # Step 4 should trigger decay
-        scheduler.step()
-        expected_lr = 0.1 * 0.1  # 0.01
-        assert abs(scheduler.get_lr() - expected_lr) < 1e-6
-    
-    def test_steplr_multiple_decays(self):
-        """Test multiple learning rate decays."""
-        w = Variable(1.0, requires_grad=True)
-        optimizer = SGD([w], learning_rate=0.1)
-        scheduler = StepLR(optimizer, step_size=2, gamma=0.5)
-        
-        # Step through multiple decay points
-        for i in range(6):
-            scheduler.step()
-        
-        # Should have decayed twice: 0.1 * 0.5 * 0.5 = 0.025
-        expected_lr = 0.1 * (0.5 ** 2)
-        assert abs(scheduler.get_lr() - expected_lr) < 1e-6
-    
-    def test_steplr_with_adam(self):
-        """Test StepLR scheduler with Adam optimizer."""
-        w = Variable(1.0, requires_grad=True)
-        optimizer = Adam([w], learning_rate=0.001)
-        scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
-        
-        # Step through decay point
-        for i in range(6):
-            scheduler.step()
-        
-        expected_lr = 0.001 * 0.1  # 0.0001
-        assert abs(scheduler.get_lr() - expected_lr) < 1e-6
-
-
-class TestTrainingIntegration:
-    """Test complete training integration."""
-    
-    def test_training_convergence(self):
-        """Test that training actually converges."""
-        sgd_w, sgd_b, adam_w, adam_b = train_simple_model()
-        
-        # Both optimizers should converge to reasonable values
-        # Target: w = 2.0, b = 1.0
-        assert abs(sgd_w - 2.0) < 1.0, f"SGD w should be close to 2.0, got {sgd_w}"
-        assert abs(sgd_b - 1.0) < 1.0, f"SGD b should be close to 1.0, got {sgd_b}"
-        assert abs(adam_w - 2.0) < 1.0, f"Adam w should be close to 2.0, got {adam_w}"
-        assert abs(adam_b - 1.0) < 1.0, f"Adam b should be close to 1.0, got {adam_b}"
-    
-    def test_optimizer_comparison(self):
-        """Test that both optimizers can learn the same problem."""
-        sgd_w, sgd_b, adam_w, adam_b = train_simple_model()
-        
-        # Both should learn something reasonable (not stay at initialization)
-        assert abs(sgd_w - 0.1) > 0.1, "SGD should update parameters from initialization"
-        assert abs(adam_w - 0.1) > 0.1, "Adam should update parameters from initialization"
-    
-    def test_learning_rate_scheduling_integration(self):
-        """Test that learning rate scheduling works in training."""
-        # This is tested implicitly in train_simple_model
-        # The fact that Adam training uses a scheduler and converges
-        # indicates the integration is working
-        pass
-
-
-def test_module_completeness():
-    """Test that all required components are implemented."""
-    # Test that all main classes can be imported
-    assert SGD is not None
-    assert Adam is not None
-    assert StepLR is not None
-    assert gradient_descent_step is not None
-    assert train_simple_model is not None
-    
-    # Test that classes have required methods
-    w = Variable(1.0, requires_grad=True)
-    
-    # Test SGD
-    sgd = SGD([w], learning_rate=0.01)
-    assert hasattr(sgd, 'step')
-    assert hasattr(sgd, 'zero_grad')
-    
-    # Test Adam
-    adam = Adam([w], learning_rate=0.001)
-    assert hasattr(adam, 'step')
-    assert hasattr(adam, 'zero_grad')
-    
-    # Test StepLR
-    scheduler = StepLR(sgd, step_size=10, gamma=0.1)
-    assert hasattr(scheduler, 'step')
-    assert hasattr(scheduler, 'get_lr')
-
-
-if __name__ == "__main__":
-    # Run tests when script is executed directly
-    pytest.main([__file__, "-v"]) 
\ No newline at end of file