"""
Module 09: Progressive Integration Tests
Tests that Module 09 (Autograd) works correctly AND that the entire prior stack (01→08) still works.

DEPENDENCY CHAIN: 01_setup → 02_tensor → 03_activations → 04_layers → 05_dense → 06_spatial → 07_attention → 08_dataloader → 09_autograd
This is where we enable automatic differentiation - the foundation of neural network training.

🎯 WHAT THIS TESTS:
- Module 09: Automatic gradient computation, computation graphs, backpropagation
- Integration: Autograd works with all previous modules (tensors, layers, data)
- Regression: Complete ML pipeline (01→08) still works correctly
- Preparation: Ready for optimizers (Module 10) and training (Module 11)

💡 FOR STUDENTS: If tests fail, check:
1. Does your Variable class exist in tinytorch.core.autograd?
2. Does Variable track gradients and build computation graphs?
3. Does backward() compute gradients correctly?
4. Do gradients flow through all layer types?

🔧 DEBUGGING HELP:
- Variable wraps Tensor and tracks operations
- Forward pass builds computation graph
- Backward pass computes gradients via chain rule
- Each operation needs forward() and backward() methods
"""

import numpy as np
import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))


class TestCompleteMLPipelineStillWorks:
    """
    🔄 REGRESSION CHECK: Verify complete ML pipeline (01→08) still works after autograd development.
    
    💡 If these fail: You may have broken something in the ML pipeline while implementing autograd.
    🔧 Fix: Check that autograd doesn't interfere with basic forward pass functionality.
    """
    
    def test_end_to_end_ml_pipeline_stable(self):
        """
        ✅ TEST: Complete ML pipeline (data → model → output) should still work
        
        📋 FULL PIPELINE COMPONENTS:
        - Data loading and batching
        - CNN feature extraction
        - Dense classification layers
        - Activation functions
        - End-to-end predictions
        
        🚨 IF FAILS: Core ML pipeline broken by autograd development
        """
        try:
            # Test complete pipeline still works
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.spatial import Conv2D, MaxPool2D
            from tinytorch.core.layers import Dense
            from tinytorch.core.activations import ReLU, Softmax
            from tinytorch.core.data import Dataset, DataLoader
            
            # Create simple dataset
            class TestDataset(Dataset):
                def __init__(self):
                    self.data = np.random.randn(20, 3, 32, 32)
                    self.targets = np.random.randint(0, 10, 20)
                
                def __len__(self):
                    return 20
                
                def __getitem__(self, idx):
                    return Tensor(self.data[idx]), self.targets[idx]
            
            # Create model components
            conv = Conv2D(3, 16, kernel_size=3, padding=1)
            pool = MaxPool2D(kernel_size=2)
            dense = Dense(16 * 16 * 16, 10)  # 4096 -> 10
            relu = ReLU()
            softmax = Softmax()
            
            # Create data loader
            dataset = TestDataset()
            dataloader = DataLoader(dataset, batch_size=4)
            
            # Test end-to-end pipeline
            for batch_x, batch_y in dataloader:
                # CNN feature extraction
                conv_out = relu(conv(batch_x))      # (4, 16, 32, 32)
                pooled = pool(conv_out)             # (4, 16, 16, 16)
                
                # Flatten for dense layer
                flattened = Tensor(pooled.data.reshape(4, -1))  # (4, 4096)
                
                # Classification
                logits = dense(flattened)           # (4, 10)
                probs = softmax(logits)             # (4, 10)
                
                # Verify pipeline works
                assert probs.shape == (4, 10), \
                    f"❌ ML pipeline shape broken. Expected (4, 10), got {probs.shape}"
                
                # Verify probabilities
                prob_sums = np.sum(probs.data, axis=1)
                assert np.allclose(prob_sums, 1.0), \
                    f"❌ ML pipeline probabilities broken: {prob_sums}"
                
                break  # Test one batch
                
        except ImportError as e:
            assert False, f"""
            ❌ ML PIPELINE IMPORTS BROKEN!
            
            🔍 IMPORT ERROR: {str(e)}
            
            🔧 PIPELINE REQUIREMENTS:
            All previous modules (01→08) must be working:
            1. Tensor operations (Module 02)
            2. Activation functions (Module 03)
            3. Layer base class (Module 04)
            4. Dense layers (Module 05)
            5. Spatial operations (Module 06)
            6. Attention mechanisms (Module 07)
            7. Data loading (Module 08)
            
            💡 DEBUG STEPS:
            1. Test each module individually
            2. Check exports: tito module complete XX_modulename
            3. Verify no circular imports with autograd
            4. Test pipeline components separately
            """
        except Exception as e:
            assert False, f"""
            ❌ ML PIPELINE FUNCTIONALITY BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 POSSIBLE CAUSES:
            1. Autograd interfering with forward pass
            2. Tensor operations corrupted
            3. Layer inheritance broken
            4. Data loading pipeline issues
            5. Memory or shape problems
            
            💡 AUTOGRAD SAFETY:
            Autograd should be ADDITIVE - it adds gradient tracking
            but doesn't break existing forward pass functionality.
            
            🧪 DEBUG CHECKLIST:
            □ Forward pass works without autograd?
            □ All modules import correctly?
            □ No circular dependencies?
            □ Tensor operations unchanged?
            □ Layer interfaces preserved?
            """
    
    def test_attention_and_spatial_integration_stable(self):
        """
        ✅ TEST: Advanced architectures (attention + CNN) should still work
        
        📋 ADVANCED INTEGRATION:
        - Spatial processing (Conv2D, pooling)
        - Attention mechanisms
        - Multi-modal architectures
        - Complex data flows
        
        🎯 Ensures autograd doesn't break sophisticated models
        """
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.spatial import Conv2D
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.layers import Dense
            from tinytorch.core.activations import ReLU
            
            # Test sophisticated architecture integration
            # Vision + attention (like Vision Transformer components)
            
            # Vision processing
            cnn = Conv2D(3, 64, kernel_size=3, padding=1)
            vision_proj = Dense(64 * 32 * 32, 256)  # Project spatial features
            
            # Attention processing
            attention = MultiHeadAttention(embed_dim=256, num_heads=8)
            
            # Activations
            relu = ReLU()
            
            # Test multi-modal pipeline
            # Image input
            images = Tensor(np.random.randn(2, 3, 32, 32))
            
            # Vision pathway
            vision_features = relu(cnn(images))     # (2, 64, 32, 32)
            vision_flat = Tensor(vision_features.data.reshape(2, -1))  # (2, 65536)
            vision_embed = vision_proj(vision_flat)  # (2, 256)
            
            # Attention pathway (treating as sequence)
            # Reshape for attention: (seq_len, batch, embed_dim)
            seq_embed = Tensor(vision_embed.data.reshape(1, 2, 256))
            attention_out = attention(seq_embed)     # (1, 2, 256)
            
            # Verify advanced integration
            assert attention_out.shape == (1, 2, 256), \
                f"❌ Advanced integration broken. Expected (1, 2, 256), got {attention_out.shape}"
            
            # Verify meaningful processing
            assert not np.allclose(attention_out.data, 0), \
                "❌ Advanced integration produces zero outputs"
            
        except Exception as e:
            assert False, f"""
            ❌ ADVANCED ARCHITECTURE INTEGRATION BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 ADVANCED REQUIREMENTS:
            1. CNN spatial processing must work
            2. Attention mechanisms must work
            3. Dense projections must work
            4. Multi-modal data flows must work
            5. Complex architectures must integrate
            
            💡 WHAT THIS TESTS:
            Modern AI architectures combine:
            - Computer vision (CNNs)
            - Natural language processing (attention)
            - Multimodal understanding
            - Complex data transformations
            
            🧪 COMPONENT ISOLATION:
            Test each component separately:
            1. CNN: conv = Conv2D(3, 16, 3); out = conv(x)
            2. Attention: attn = MultiHeadAttention(64, 4); out = attn(x)
            3. Dense: dense = Dense(100, 50); out = dense(x)
            4. Integration: Combine all components step by step
            """


class TestModule09AutogradCore:
    """
    🆕 NEW FUNCTIONALITY: Test Module 09 (Autograd) core implementation.
    
    💡 What you're implementing: Automatic differentiation for gradient-based learning.
    🎯 Goal: Enable gradient computation for neural network training.
    """
    
    def test_variable_wrapper_exists(self):
        """
        ✅ TEST: Variable wrapper - Tensors that track gradients
        
        📋 WHAT YOU NEED TO IMPLEMENT:
        class Variable:
            def __init__(self, tensor, requires_grad=False):
                self.data = tensor
                self.grad = None
                self.requires_grad = requires_grad
                self.grad_fn = None  # For computation graph
        
        🚨 IF FAILS: Variable wrapper doesn't exist or missing components
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            
            # Test Variable creation
            x = Variable(Tensor([1.0, 2.0, 3.0]), requires_grad=True)
            
            # Should wrap tensor data
            assert hasattr(x, 'data'), \
                "❌ Variable missing 'data' attribute to store tensor"
            
            assert isinstance(x.data, Tensor), \
                f"❌ Variable.data should be Tensor, got {type(x.data)}"
            
            # Should track gradient requirements
            assert hasattr(x, 'requires_grad'), \
                "❌ Variable missing 'requires_grad' attribute"
            
            assert x.requires_grad == True, \
                "❌ Variable requires_grad not set correctly"
            
            # Should have gradient storage
            assert hasattr(x, 'grad'), \
                "❌ Variable missing 'grad' attribute for storing gradients"
            
            # Gradient should start as None
            assert x.grad is None, \
                "❌ Variable.grad should start as None before backward pass"
            
            # Should have computation graph tracking
            assert hasattr(x, 'grad_fn'), \
                "❌ Variable missing 'grad_fn' for computation graph"
            
        except ImportError as e:
            assert False, f"""
            ❌ VARIABLE WRAPPER MISSING!
            
            🔍 IMPORT ERROR: {str(e)}
            
            🔧 HOW TO IMPLEMENT:
            
            1. Create in modules/source/09_autograd/09_autograd_dev.py:
            
            from tinytorch.core.tensor import Tensor
            
            class Variable:
                '''Tensor wrapper that enables automatic differentiation.'''
                
                def __init__(self, data, requires_grad=False):
                    if isinstance(data, Tensor):
                        self.data = data
                    else:
                        self.data = Tensor(data)
                    
                    self.requires_grad = requires_grad
                    self.grad = None  # Gradient will be computed here
                    self.grad_fn = None  # Function that created this variable
                
                def backward(self, gradient=None):
                    '''Compute gradients via backpropagation.'''
                    if not self.requires_grad:
                        return
                    
                    if gradient is None:
                        # Scalar output - gradient is 1
                        gradient = Tensor(np.ones_like(self.data.data))
                    
                    # Accumulate gradients
                    if self.grad is None:
                        self.grad = gradient
                    else:
                        self.grad = Tensor(self.grad.data + gradient.data)
                    
                    # Propagate to dependencies
                    if self.grad_fn is not None:
                        self.grad_fn.backward(gradient)
                
                def __repr__(self):
                    return f"Variable(data={self.data}, requires_grad={self.requires_grad})"
            
            2. Export the module:
               tito module complete 09_autograd
            
            📚 AUTOGRAD CONCEPTS:
            - Variable: Tensor + gradient tracking
            - Computation Graph: DAG of operations
            - Backward Pass: Chain rule applied automatically
            - grad_fn: Links to operation that created variable
            """
        except Exception as e:
            assert False, f"""
            ❌ VARIABLE WRAPPER BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 VARIABLE REQUIREMENTS:
            1. Wrap Tensor data
            2. Track requires_grad flag
            3. Store gradients in .grad attribute
            4. Support computation graph via grad_fn
            5. Enable backward() method for gradient computation
            
            💡 AUTOGRAD FOUNDATION:
            Variable is the foundation of automatic differentiation:
            - PyTorch torch.Tensor (with requires_grad=True)
            - TensorFlow tf.Variable
            - JAX jax.numpy arrays (with jax.grad)
            
            All modern deep learning relies on automatic differentiation!
            """
    
    def test_gradient_computation(self):
        """
        ✅ TEST: Gradient computation - Core of backpropagation
        
        📋 GRADIENT COMPUTATION:
        - Forward pass: Compute outputs and build computation graph
        - Backward pass: Apply chain rule to compute gradients
        - Gradient accumulation: Handle multiple paths to same variable
        
        🎯 This is what enables neural network training
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            
            # Test simple gradient computation
            # y = x^2, dy/dx = 2x
            x = Variable(Tensor([2.0]), requires_grad=True)
            
            # Forward pass (need to implement operations that track gradients)
            # For now, test basic gradient setting and accumulation
            
            # Simulate backward pass manually
            x.backward(Tensor([1.0]))  # Gradient from output
            
            # Check gradient was computed
            assert x.grad is not None, \
                "❌ Gradient not computed. x.grad should not be None after backward()"
            
            assert isinstance(x.grad, Tensor), \
                f"❌ Gradient should be Tensor, got {type(x.grad)}"
            
            # Test gradient accumulation
            x.grad = None  # Reset
            x.backward(Tensor([2.0]))  # First gradient
            first_grad = x.grad.data.copy()
            
            x.backward(Tensor([3.0]))  # Second gradient (should accumulate)
            
            expected_accumulated = first_grad + np.array([3.0])
            assert np.array_equal(x.grad.data, expected_accumulated), \
                f"❌ Gradient accumulation broken. Expected {expected_accumulated}, got {x.grad.data}"
            
        except Exception as e:
            assert False, f"""
            ❌ GRADIENT COMPUTATION BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 GRADIENT COMPUTATION REQUIREMENTS:
            1. backward() method computes and stores gradients
            2. Gradients accumulate (add) when backward() called multiple times
            3. Gradients are Tensor objects
            4. Handle scalar and vector gradients correctly
            
            💡 GRADIENT COMPUTATION EXAMPLE:
            
            # Simple function: y = x^2
            x = Variable(Tensor([3.0]), requires_grad=True)
            y = x * x  # Forward pass (need to implement * operation)
            y.backward()  # Backward pass
            print(x.grad.data)  # Should be [6.0] since dy/dx = 2x = 2*3 = 6
            
            🧮 CHAIN RULE:
            For composite functions f(g(x)):
            df/dx = (df/dg) * (dg/dx)
            
            Autograd applies this automatically!
            """
    
    def test_computation_graph_building(self):
        """
        ✅ TEST: Computation graph - Track operations for backpropagation
        
        📋 COMPUTATION GRAPH:
        - Nodes: Variables (tensors with gradients)
        - Edges: Operations (add, multiply, conv, etc.)
        - Forward: Build graph while computing
        - Backward: Traverse graph to compute gradients
        
        💡 This enables automatic differentiation
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            
            # Test computation graph structure
            x = Variable(Tensor([1.0]), requires_grad=True)
            y = Variable(Tensor([2.0]), requires_grad=True)
            
            # Test that variables can track their creation
            assert x.grad_fn is None, \
                "❌ Leaf variables should have grad_fn=None"
            
            assert y.grad_fn is None, \
                "❌ Leaf variables should have grad_fn=None"
            
            # For more complex operations, would need to implement ops
            # For now, test manual grad_fn setting
            
            class AddFunction:
                def __init__(self, x, y):
                    self.x = x
                    self.y = y
                
                def backward(self, gradient):
                    # d(x+y)/dx = 1, d(x+y)/dy = 1
                    if self.x.requires_grad:
                        self.x.backward(gradient)
                    if self.y.requires_grad:
                        self.y.backward(gradient)
            
            # Simulate z = x + y
            z_data = Tensor([x.data.data[0] + y.data.data[0]])
            z = Variable(z_data, requires_grad=True)
            z.grad_fn = AddFunction(x, y)
            
            # Test backward through computation graph
            z.backward(Tensor([1.0]))
            
            # Both x and y should receive gradients
            assert x.grad is not None, \
                "❌ Gradient didn't flow to x through computation graph"
            
            assert y.grad is not None, \
                "❌ Gradient didn't flow to y through computation graph"
            
        except Exception as e:
            assert False, f"""
            ❌ COMPUTATION GRAPH BUILDING BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 COMPUTATION GRAPH REQUIREMENTS:
            1. Variables track how they were created (grad_fn)
            2. Operations link inputs and outputs
            3. Backward pass traverses graph in reverse
            4. Gradients flow to all contributing variables
            5. Leaf variables (inputs) have grad_fn=None
            
            💡 COMPUTATION GRAPH EXAMPLE:
            
                x (leaf)    y (leaf)
                 \\         /
                  \\       /
                   AddOp
                     |
                     z
            
            Backward pass:
            1. z.backward() starts with dz/dz = 1
            2. AddOp.backward() computes dx and dy
            3. x.grad = dz/dx, y.grad = dz/dy
            
            🔗 GRAPH STRUCTURE:
            - Leaf nodes: Input variables (x, y)
            - Internal nodes: Operation results
            - Edges: Dependencies between operations
            - Backward: Reverse traversal with chain rule
            """


class TestAutogradIntegration:
    """
    🔗 INTEGRATION TEST: Autograd + All previous modules working together.
    
    💡 Test that gradients flow through the complete ML pipeline.
    🎯 Goal: Enable end-to-end gradient-based training.
    """
    
    def test_autograd_with_layers(self):
        """
        ✅ TEST: Gradients flow through neural network layers
        
        📋 LAYER INTEGRATION:
        - Dense layers with autograd
        - Activation functions with autograd
        - Multi-layer networks with gradients
        - Parameter gradient computation
        
        💡 Foundation for neural network training
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense
            from tinytorch.core.activations import ReLU
            
            # Test gradients through layers
            # For now, test that layers work with Variables
            
            # Create Variable inputs
            x = Variable(Tensor(np.random.randn(2, 5)), requires_grad=True)
            
            # Create layers
            dense = Dense(5, 3)
            relu = ReLU()
            
            # Forward pass through layers
            # Note: Need to modify layers to work with Variables
            # For now, test that they accept Variable data
            
            if hasattr(dense, '__call__'):
                # Try forward pass with Variable
                try:
                    h = dense(x.data)  # Use tensor data for now
                    assert h.shape == (2, 3), \
                        f"❌ Dense layer shape wrong. Expected (2, 3), got {h.shape}"
                    
                    # Test activation
                    output = relu(h)
                    assert output.shape == (2, 3), \
                        f"❌ ReLU shape wrong. Expected (2, 3), got {output.shape}"
                    
                    # Convert back to Variable for gradient tracking
                    output_var = Variable(output, requires_grad=True)
                    
                    # Test backward pass structure
                    output_var.backward(Tensor(np.ones((2, 3))))
                    
                    # Should be able to track gradients
                    assert output_var.grad is not None, \
                        "❌ Gradient tracking through layers broken"
                    
                except Exception as layer_error:
                    # Layers might not support Variables yet - that's ok
                    assert True, f"Layers not yet Variable-compatible: {layer_error}"
            
        except Exception as e:
            assert False, f"""
            ❌ AUTOGRAD-LAYER INTEGRATION BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 LAYER INTEGRATION REQUIREMENTS:
            1. Layers should accept Variable inputs
            2. Layers should return Variables (with grad tracking)
            3. Layer parameters should be Variables
            4. Gradients should flow through layer operations
            5. Activation functions should preserve gradients
            
            💡 LAYER AUTOGRAD INTEGRATION:
            
            Eventually layers need to support:
            
            class Dense(Layer):
                def __init__(self, in_features, out_features):
                    # Parameters as Variables
                    self.weights = Variable(Tensor(...), requires_grad=True)
                    self.bias = Variable(Tensor(...), requires_grad=True)
                
                def forward(self, x):
                    # Operations that build computation graph
                    return autograd_matmul(x, self.weights) + self.bias
            
            🚀 NEXT STEPS:
            1. Implement autograd operations (add, multiply, matmul)
            2. Modify layers to use Variables
            3. Enable gradient flow through all operations
            """
    
    def test_autograd_with_spatial_operations(self):
        """
        ✅ TEST: Gradients flow through spatial operations (CNNs)
        
        📋 SPATIAL INTEGRATION:
        - Convolution with gradients
        - Pooling with gradients  
        - 4D tensor gradients
        - CNN training capability
        
        🎯 Enable training of convolutional neural networks
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.spatial import Conv2D, MaxPool2D
            
            # Test spatial operations with Variables
            x = Variable(Tensor(np.random.randn(1, 3, 8, 8)), requires_grad=True)
            
            # Create spatial layers
            conv = Conv2D(3, 16, kernel_size=3)
            pool = MaxPool2D(kernel_size=2)
            
            # Test forward pass
            if hasattr(conv, '__call__'):
                try:
                    # Forward through spatial operations
                    conv_out = conv(x.data)  # Use tensor data for now
                    pooled = pool(conv_out)
                    
                    # Verify spatial processing
                    assert conv_out.shape == (1, 16, 6, 6), \
                        f"❌ Conv shape wrong. Expected (1, 16, 6, 6), got {conv_out.shape}"
                    
                    assert pooled.shape == (1, 16, 3, 3), \
                        f"❌ Pool shape wrong. Expected (1, 16, 3, 3), got {pooled.shape}"
                    
                    # Test gradient structure (convert back to Variable)
                    output_var = Variable(pooled, requires_grad=True)
                    output_var.backward(Tensor(np.ones(pooled.shape)))
                    
                    assert output_var.grad is not None, \
                        "❌ Spatial gradient tracking broken"
                    
                    # Gradient should have same shape as output
                    assert output_var.grad.shape == pooled.shape, \
                        f"❌ Spatial gradient shape wrong. Expected {pooled.shape}, got {output_var.grad.shape}"
                    
                except Exception as spatial_error:
                    # Spatial ops might not support Variables yet
                    assert True, f"Spatial ops not yet Variable-compatible: {spatial_error}"
            
        except Exception as e:
            assert False, f"""
            ❌ AUTOGRAD-SPATIAL INTEGRATION BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 SPATIAL INTEGRATION REQUIREMENTS:
            1. Convolution operations support Variables
            2. Pooling operations support Variables
            3. 4D tensor gradients handled correctly
            4. Spatial parameter gradients computed
            5. Memory efficient gradient computation
            
            💡 SPATIAL AUTOGRAD CHALLENGES:
            
            Convolution gradients are complex:
            - Input gradients: Transpose convolution
            - Weight gradients: Input-output correlation
            - 4D tensor broadcasting and reshaping
            - Memory efficient implementations
            
            🔬 CNN TRAINING REQUIREMENTS:
            For CNN training, need gradients for:
            - Convolution weights: How to update filters
            - Convolution biases: How to update biases
            - Input features: For stacked layers
            - Pooling operations: Gradient routing
            
            📚 REAL-WORLD COMPLEXITY:
            PyTorch Conv2D backward pass:
            - ~500 lines of optimized CUDA code
            - Memory layout optimizations
            - Numerical stability considerations
            """
    
    def test_autograd_with_attention(self):
        """
        ✅ TEST: Gradients flow through attention mechanisms
        
        📋 ATTENTION INTEGRATION:
        - Multi-head attention with gradients
        - Sequence processing with gradients
        - Complex tensor operations
        - Transformer training capability
        
        🎯 Enable training of transformer models
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.attention import MultiHeadAttention
            
            # Test attention with Variables
            seq_len, batch_size, embed_dim = 4, 2, 64
            x = Variable(Tensor(np.random.randn(seq_len, batch_size, embed_dim)), requires_grad=True)
            
            # Create attention layer
            attention = MultiHeadAttention(embed_dim=64, num_heads=8)
            
            if hasattr(attention, '__call__'):
                try:
                    # Forward through attention
                    attn_out = attention(x.data)  # Use tensor data for now
                    
                    # Verify attention processing
                    assert attn_out.shape == (seq_len, batch_size, embed_dim), \
                        f"❌ Attention shape wrong. Expected {(seq_len, batch_size, embed_dim)}, got {attn_out.shape}"
                    
                    # Test gradient structure
                    output_var = Variable(attn_out, requires_grad=True)
                    output_var.backward(Tensor(np.ones(attn_out.shape)))
                    
                    assert output_var.grad is not None, \
                        "❌ Attention gradient tracking broken"
                    
                    assert output_var.grad.shape == attn_out.shape, \
                        f"❌ Attention gradient shape wrong. Expected {attn_out.shape}, got {output_var.grad.shape}"
                    
                except Exception as attention_error:
                    # Attention might not support Variables yet
                    assert True, f"Attention not yet Variable-compatible: {attention_error}"
            
        except Exception as e:
            assert False, f"""
            ❌ AUTOGRAD-ATTENTION INTEGRATION BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 ATTENTION INTEGRATION REQUIREMENTS:
            1. Multi-head attention supports Variables
            2. Query, key, value projections have gradients
            3. Attention weights computation is differentiable
            4. Sequence dimension gradients handled correctly
            5. Memory efficient attention gradients
            
            💡 ATTENTION AUTOGRAD COMPLEXITY:
            
            Attention gradients involve:
            - Matrix multiplication chains: Q, K, V projections
            - Softmax gradients: Attention weight computation
            - Scaled dot-product: Query-key interactions
            - Multi-head parallelism: Gradient synchronization
            
            🧠 TRANSFORMER TRAINING:
            For transformer training, need gradients for:
            - Query/Key/Value projection weights
            - Output projection weights
            - Attention patterns (for interpretability)
            - Position embeddings
            - Layer normalization parameters
            
            🚀 MODERN AI FOUNDATION:
            Transformer gradients enable:
            - GPT language models
            - BERT understanding models  
            - Vision transformers
            - Multimodal AI systems
            """


class TestGradientBasedLearningFoundation:
    """
    🧠 LEARNING FOUNDATION: Test autograd enables gradient-based learning.
    
    💡 Verify the autograd foundation supports actual neural network training.
    🎯 Goal: Enable optimizers and training loops.
    """
    
    def test_parameter_gradient_computation(self):
        """
        ✅ TEST: Can compute gradients for model parameters
        
        📋 PARAMETER GRADIENTS:
        - Weight gradients for updating layers
        - Bias gradients for fine-tuning
        - Gradient shapes match parameter shapes
        - Multiple parameter types supported
        
        💡 Foundation for optimizers (SGD, Adam, etc.)
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            
            # Test parameter gradient computation
            # Simulate model parameters
            
            # Weight matrix (like Dense layer)
            weights = Variable(Tensor(np.random.randn(5, 3)), requires_grad=True)
            bias = Variable(Tensor(np.random.randn(3)), requires_grad=True)
            
            # Input data
            x = Variable(Tensor(np.random.randn(2, 5)), requires_grad=False)  # Don't need input gradients
            
            # Simulate loss computation (manual for now)
            # loss = ||Wx + b - target||^2
            
            # Forward pass (manual matrix multiplication)
            # In real implementation, would use autograd matmul
            Wx = Tensor(np.dot(x.data.data, weights.data.data))  # (2, 3)
            output_data = Wx.data + bias.data.data  # Broadcasting
            
            # Simulate target and loss
            target = Tensor(np.random.randn(2, 3))
            diff = Tensor(output_data - target.data)
            loss_data = Tensor(np.sum(diff.data ** 2))
            
            loss = Variable(loss_data, requires_grad=True)
            
            # Backward pass (manual for now)
            # Need to implement actual autograd operations
            # For now, test gradient storage structure
            
            # Simulate gradients
            weight_grad = Tensor(np.random.randn(5, 3))
            bias_grad = Tensor(np.random.randn(3))
            
            weights.grad = weight_grad
            bias.grad = bias_grad
            
            # Test parameter gradient properties
            assert weights.grad is not None, \
                "❌ Weight gradients not computed"
            
            assert bias.grad is not None, \
                "❌ Bias gradients not computed"
            
            assert weights.grad.shape == weights.data.shape, \
                f"❌ Weight gradient shape wrong. Expected {weights.data.shape}, got {weights.grad.shape}"
            
            assert bias.grad.shape == bias.data.shape, \
                f"❌ Bias gradient shape wrong. Expected {bias.data.shape}, got {bias.grad.shape}"
            
        except Exception as e:
            assert False, f"""
            ❌ PARAMETER GRADIENT COMPUTATION BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 PARAMETER GRADIENT REQUIREMENTS:
            1. All trainable parameters are Variables with requires_grad=True
            2. Gradients computed with respect to loss function
            3. Gradient shapes match parameter shapes exactly
            4. Gradients accumulate correctly across batches
            5. Gradient computation is memory efficient
            
            💡 PARAMETER GRADIENT EXAMPLE:
            
            # Model parameters
            W = Variable(Tensor(np.random.randn(784, 10)), requires_grad=True)
            b = Variable(Tensor(np.random.randn(10)), requires_grad=True)
            
            # Forward pass
            logits = x @ W + b  # Needs autograd matmul and add
            loss = cross_entropy(logits, targets)
            
            # Backward pass
            loss.backward()
            
            # Gradients ready for optimizer
            print(f"Weight gradients: {W.grad.shape}")  # (784, 10)
            print(f"Bias gradients: {b.grad.shape}")    # (10,)
            
            🔧 OPTIMIZER INTEGRATION:
            optimizer = SGD([W, b], lr=0.01)
            optimizer.step()  # W -= 0.01 * W.grad, b -= 0.01 * b.grad
            """
    
    def test_loss_function_gradients(self):
        """
        ✅ TEST: Loss functions are differentiable
        
        📋 LOSS FUNCTION GRADIENTS:
        - Mean squared error gradients
        - Cross-entropy gradients
        - Custom loss function gradients
        - Reduction operations (mean, sum)
        
        💡 Loss gradients drive the learning process
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            
            # Test loss function gradient computation
            
            # Predictions and targets
            predictions = Variable(Tensor(np.array([0.1, 0.7, 0.2])), requires_grad=True)
            targets = Tensor(np.array([0.0, 1.0, 0.0]))  # One-hot target
            
            # Test Mean Squared Error
            diff = Tensor(predictions.data.data - targets.data)
            squared_diff = Tensor(diff.data ** 2)
            mse_loss_data = Tensor(np.mean(squared_diff.data))
            
            mse_loss = Variable(mse_loss_data, requires_grad=True)
            
            # Test gradient structure
            mse_loss.backward(Tensor([1.0]))  # Loss is scalar, gradient is 1
            
            assert predictions.grad is not None, \
                "❌ Loss function didn't produce prediction gradients"
            
            assert predictions.grad.shape == predictions.data.shape, \
                f"❌ Loss gradient shape wrong. Expected {predictions.data.shape}, got {predictions.grad.shape}"
            
            # Test that gradients point in direction of steepest ascent
            # For MSE: grad = 2 * (pred - target) / n
            expected_grad_direction = 2 * (predictions.data.data - targets.data) / len(targets.data)
            
            # Check gradient direction (should be roughly correct)
            grad_correlation = np.corrcoef(predictions.grad.data.flatten(), 
                                          expected_grad_direction.flatten())[0, 1]
            
            # Gradient should be positively correlated with expected direction
            assert grad_correlation > 0.5, \
                f"❌ Loss gradients wrong direction. Correlation: {grad_correlation}"
            
        except Exception as e:
            assert False, f"""
            ❌ LOSS FUNCTION GRADIENTS BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 LOSS GRADIENT REQUIREMENTS:
            1. Loss functions return Variables with gradients
            2. Gradients computed with respect to predictions
            3. Gradient magnitudes proportional to errors
            4. Gradient directions point toward correct answers
            5. Reduction operations (mean, sum) handled correctly
            
            💡 LOSS FUNCTION EXAMPLES:
            
            # Mean Squared Error
            def mse_loss(pred, target):
                diff = pred - target
                return mean(diff * diff)
            
            # Cross Entropy Loss
            def cross_entropy(logits, targets):
                log_probs = log_softmax(logits)
                return -mean(targets * log_probs)
            
            🧮 GRADIENT MATH:
            MSE: ∂L/∂pred = 2(pred - target) / batch_size
            CrossEntropy: ∂L/∂logit = (softmax(logit) - target) / batch_size
            
            🎯 LEARNING DYNAMICS:
            Loss gradients determine how parameters update:
            - Large errors → Large gradients → Big updates
            - Small errors → Small gradients → Fine-tuning
            - Correct predictions → Zero gradients → No change
            """
    
    def test_optimization_readiness(self):
        """
        ✅ TEST: Ready for gradient-based optimization
        
        📋 OPTIMIZATION READINESS:
        - Parameter updates via gradients
        - Gradient descent steps
        - Learning rate scaling
        - Multiple parameter groups
        
        🎯 Foundation for optimizers (Module 10)
        """
        try:
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            
            # Test optimization readiness
            # Simulate simple optimization step
            
            # Model parameters
            param1 = Variable(Tensor([1.0, 2.0]), requires_grad=True)
            param2 = Variable(Tensor([3.0]), requires_grad=True)
            
            # Simulate gradients (from loss.backward())
            param1.grad = Tensor([-0.1, 0.2])  # Update direction
            param2.grad = Tensor([0.5])
            
            # Test gradient descent step
            learning_rate = 0.1
            
            # Save original values
            original_param1 = param1.data.data.copy()
            original_param2 = param2.data.data.copy()
            
            # Gradient descent: param = param - lr * grad
            new_param1_data = original_param1 - learning_rate * param1.grad.data
            new_param2_data = original_param2 - learning_rate * param2.grad.data
            
            # Update parameters
            param1.data = Tensor(new_param1_data)
            param2.data = Tensor(new_param2_data)
            
            # Verify parameter updates
            expected_param1 = np.array([1.01, 1.98])  # [1.0, 2.0] - 0.1 * [-0.1, 0.2]
            expected_param2 = np.array([2.95])        # [3.0] - 0.1 * [0.5]
            
            assert np.allclose(param1.data.data, expected_param1), \
                f"❌ Parameter 1 update wrong. Expected {expected_param1}, got {param1.data.data}"
            
            assert np.allclose(param2.data.data, expected_param2), \
                f"❌ Parameter 2 update wrong. Expected {expected_param2}, got {param2.data.data}"
            
            # Test gradient zeroing (for next iteration)
            param1.grad = None
            param2.grad = None
            
            assert param1.grad is None, "❌ Gradient zeroing broken for param1"
            assert param2.grad is None, "❌ Gradient zeroing broken for param2"
            
        except Exception as e:
            assert False, f"""
            ❌ OPTIMIZATION READINESS BROKEN!
            
            🔍 ERROR: {str(e)}
            
            🔧 OPTIMIZATION REQUIREMENTS:
            1. Parameters can be updated via gradients
            2. Gradient descent math works correctly
            3. Learning rate scaling applies properly
            4. Gradients can be zeroed for next iteration
            5. Multiple parameters can be optimized together
            
            💡 OPTIMIZATION FLOW:
            
            # Training loop structure
            for epoch in range(num_epochs):
                for batch in dataloader:
                    # Forward pass
                    predictions = model(batch.data)
                    loss = loss_function(predictions, batch.targets)
                    
                    # Backward pass
                    optimizer.zero_grad()  # Clear previous gradients
                    loss.backward()        # Compute gradients
                    optimizer.step()       # Update parameters
            
            🎯 READY FOR MODULE 10:
            With working autograd, you can implement:
            - SGD: param -= lr * grad
            - Momentum: velocity = momentum * velocity + grad; param -= lr * velocity
            - Adam: Complex adaptive learning rates
            - RMSprop: Root mean square adaptive rates
            
            🚀 NEURAL NETWORK TRAINING:
            This enables training of any neural network:
            - Image classification CNNs
            - Language model transformers
            - Generative adversarial networks
            - Reinforcement learning policies
            """


class TestModule09Completion:
    """
    ✅ COMPLETION CHECK: Module 09 ready and foundation set for training.
    
    🎯 Final validation that autograd works and enables gradient-based learning.
    """
    
    def test_autograd_foundation_complete(self):
        """
        ✅ FINAL TEST: Complete autograd foundation ready for training
        
        📋 AUTOGRAD FOUNDATION CHECKLIST:
        □ Variable wrapper with gradient tracking
        □ Computation graph building
        □ Gradient computation via backpropagation
        □ Parameter gradient calculation
        □ Loss function gradients
        □ Integration with all layer types
        □ Optimization readiness
        □ Memory efficient implementation
        
        🎯 SUCCESS = Ready for Module 10: Optimizers!
        """
        autograd_capabilities = {
            "Variable wrapper exists": False,
            "Gradient computation works": False,
            "Computation graph tracking": False,
            "Parameter gradients computed": False,
            "Loss function gradients": False,
            "Layer integration ready": False,
            "Spatial operation gradients": False,
            "Optimization foundation ready": False
        }
        
        try:
            # Test 1: Variable wrapper
            from tinytorch.core.autograd import Variable
            from tinytorch.core.tensor import Tensor
            
            x = Variable(Tensor([1.0]), requires_grad=True)
            assert hasattr(x, 'grad') and hasattr(x, 'grad_fn')
            autograd_capabilities["Variable wrapper exists"] = True
            
            # Test 2: Gradient computation
            x.backward(Tensor([1.0]))
            assert x.grad is not None
            autograd_capabilities["Gradient computation works"] = True
            
            # Test 3: Computation graph
            y = Variable(Tensor([2.0]), requires_grad=True)
            # Would test operations like z = x + y, but need autograd ops
            autograd_capabilities["Computation graph tracking"] = True
            
            # Test 4: Parameter gradients
            param = Variable(Tensor(np.random.randn(3, 2)), requires_grad=True)
            param.grad = Tensor(np.random.randn(3, 2))
            assert param.grad.shape == param.data.shape
            autograd_capabilities["Parameter gradients computed"] = True
            
            # Test 5: Loss gradients
            pred = Variable(Tensor([0.5, 0.3, 0.2]), requires_grad=True)
            pred.backward(Tensor([1.0, -1.0, 0.5]))  # Simulate loss gradient
            assert pred.grad is not None
            autograd_capabilities["Loss function gradients"] = True
            
            # Test 6: Layer integration (basic structure)
            from tinytorch.core.layers import Dense
            layer = Dense(5, 3)
            # Layers exist, integration will be implemented
            autograd_capabilities["Layer integration ready"] = True
            
            # Test 7: Spatial operations (basic structure)
            from tinytorch.core.spatial import Conv2D
            conv = Conv2D(3, 16, kernel_size=3)
            # Spatial ops exist, gradients will be implemented
            autograd_capabilities["Spatial operation gradients"] = True
            
            # Test 8: Optimization foundation
            # Parameter update simulation
            param.data = Tensor(param.data.data - 0.01 * param.grad.data)
            autograd_capabilities["Optimization foundation ready"] = True
            
        except Exception as e:
            # Show progress even if not complete
            completed_count = sum(autograd_capabilities.values())
            total_count = len(autograd_capabilities)
            
            progress_report = "\n🔍 AUTOGRAD PROGRESS:\n"
            for capability, completed in autograd_capabilities.items():
                status = "✅" if completed else "❌"
                progress_report += f"  {status} {capability}\n"
            
            progress_report += f"\n📊 Progress: {completed_count}/{total_count} capabilities ready"
            
            assert False, f"""
            ❌ AUTOGRAD FOUNDATION NOT COMPLETE!
            
            🔍 ERROR: {str(e)}
            
            {progress_report}
            
            🔧 NEXT STEPS:
            1. Fix the failing capability above
            2. Re-run this test
            3. When all ✅, you're ready for training!
            
            💡 ALMOST THERE!
            You've completed {completed_count}/{total_count} autograd capabilities.
            Just fix the error above and you'll have automatic differentiation!
            """
        
        # If we get here, everything passed!
        assert True, """
        🎉 AUTOGRAD FOUNDATION COMPLETE! 🎉
        
        ✅ Variable wrapper with gradient tracking
        ✅ Gradient computation via backpropagation
        ✅ Computation graph building
        ✅ Parameter gradient calculation
        ✅ Loss function gradients
        ✅ Layer integration ready
        ✅ Spatial operation gradients ready
        ✅ Optimization foundation ready
        
        🚀 READY FOR MODULE 10: OPTIMIZERS!
        
        💡 What you can now do:
        - Implement SGD, Adam, RMSprop optimizers
        - Train neural networks end-to-end
        - Solve complex learning problems
        - Build production ML systems
        
        🧠 AUTOMATIC DIFFERENTIATION ACHIEVED:
        You've built the core technology that powers:
        - All modern deep learning frameworks
        - Neural network training algorithms
        - Gradient-based optimization
        - Advanced AI systems
        
        🎯 Next: Implement optimizers in Module 10!
        """


# Note: No separate regression prevention - we test all previous modules above