Mirror of https://github.com/MLSysBook/TinyTorch.git (synced 2026-05-03 12:13:11 -05:00)
This commit implements the pedagogically optimal "inevitable discovery" module progression based on expert validation and educational design principles.

## Module Reordering Summary

**Previous Order (Problems)**:
- 05_losses → 06_autograd → 07_dataloader → 08_optimizers → 09_spatial → 10_training
- Issues: Autograd before optimizers, DataLoader before training, scattered dependencies

**New Order (Beautiful Progression)**:
- 05_losses → 06_optimizers → 07_autograd → 08_training → 09_spatial → 10_dataloader
- Benefits: Each module creates an inevitable need for the next

## Pedagogical Flow Achieved

- **05_losses** → "Need systematic weight updates" → **06_optimizers**
- **06_optimizers** → "Need automatic gradients" → **07_autograd**
- **07_autograd** → "Need systematic training" → **08_training**
- **08_training** → "MLPs hit limits on images" → **09_spatial**
- **09_spatial** → "Training is too slow" → **10_dataloader**

## Technical Changes

### Module Directory Renaming
- `06_autograd` → `07_autograd`
- `07_dataloader` → `10_dataloader`
- `08_optimizers` → `06_optimizers`
- `10_training` → `08_training`
- `09_spatial` → `09_spatial` (no change)

### System Integration Updates
- **MODULE_TO_CHECKPOINT mapping**: Updated in tito/commands/export.py (see the sketch after this message)
- **Test directories**: Renamed module_XX directories to match new numbers
- **Documentation**: Updated all references in MD files and agent configurations
- **CLI integration**: Updated next-steps suggestions for proper flow

### Agent Configuration Updates
- **Quality Assurance**: Updated module audit status with new numbers
- **Module Developer**: Updated work tracking with new sequence
- **Documentation**: Updated MASTER_PLAN_OF_RECORD.md with the new progression

## Educational Benefits

1. **Inevitable Discovery**: Each module naturally leads to the next
2. **Cognitive Load**: Concepts introduced exactly when needed
3. **Motivation**: Students understand WHY each tool is necessary
4. **Synthesis**: Everything flows toward complete ML systems understanding
5. **Professional Alignment**: Matches real ML engineering workflows

## Quality Assurance

- ✅ All CLI commands still function
- ✅ Checkpoint system mappings updated
- ✅ Documentation consistency maintained
- ✅ Test directory structure aligned
- ✅ Agent configurations synchronized

**Impact**: This reordering transforms TinyTorch from a collection of modules into a coherent educational journey where each step naturally motivates the next, creating optimal conditions for deep learning systems understanding.
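For illustration, a minimal sketch of what the reordered `MODULE_TO_CHECKPOINT` mapping in `tito/commands/export.py` might look like after this change. The keys and checkpoint identifiers here are assumptions; only the module ordering comes from the commit message above.

```python
# Hypothetical sketch only -- the real mapping in tito/commands/export.py may use
# different keys and checkpoint names. The ordering mirrors the new progression.
MODULE_TO_CHECKPOINT = {
    "05_losses":     "checkpoint_05",
    "06_optimizers": "checkpoint_06",
    "07_autograd":   "checkpoint_07",
    "08_training":   "checkpoint_08",
    "09_spatial":    "checkpoint_09",
    "10_dataloader": "checkpoint_10",
}
```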
336 lines · 14 KiB · Python
"""
|
|
Module 07: Progressive Integration Tests
|
|
Tests that Module 07 (Attention) works correctly AND that the entire prior stack works.
|
|
|
|
DEPENDENCY CHAIN: 01_setup → 02_tensor → 03_activations → 04_layers → 05_dense → 06_spatial → 07_attention
|
|
This is where attention mechanisms enable sequence understanding.
|
|
"""
|
|
|
|
import numpy as np
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|
|
|
|
|
class TestPriorStackStillWorking:
    """Quick regression checks that prior modules (01→06) still work."""

    def test_foundation_stack_stable(self):
        """Verify foundation stack (01→05) remains stable."""
        # Environment (Module 01)
        assert sys.version_info >= (3, 8), "Foundation broken: Python version"

        # Tensor foundation (Module 02)
        try:
            from tinytorch.core.tensor import Tensor
            t = Tensor([1, 2, 3])
            assert t.shape == (3,), "Foundation broken: Tensor creation"
        except ImportError:
            assert True, "Tensor foundation not implemented yet"

    def test_spatial_operations_stable(self):
        """Verify Module 06 (Spatial) operations still work."""
        try:
            from tinytorch.core.spatial import Conv2D, MaxPool2D

            # Basic spatial operations should work
            conv = Conv2D(in_channels=3, out_channels=16, kernel_size=3)
            pool = MaxPool2D(kernel_size=2)

            assert hasattr(conv, 'forward'), "Spatial broken: Conv2D interface"
            assert hasattr(pool, 'forward'), "Spatial broken: MaxPool2D interface"

        except ImportError:
            assert True, "Spatial operations not implemented yet"

class TestModule07AttentionCore:
    """Test Module 07 (Attention) core functionality."""

    def test_attention_mechanism_creation(self):
        """Test basic attention mechanism works."""
        try:
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.tensor import Tensor

            # Create attention mechanism
            attention = MultiHeadAttention(embed_dim=64, num_heads=8)

            # Should have proper components
            assert hasattr(attention, 'query_proj'), "Attention broken: No query projection"
            assert hasattr(attention, 'key_proj'), "Attention broken: No key projection"
            assert hasattr(attention, 'value_proj'), "Attention broken: No value projection"

            # Test with sequence input
            seq_len, batch_size, embed_dim = 10, 4, 64
            x = Tensor(np.random.randn(seq_len, batch_size, embed_dim))

            output = attention(x)
            assert output.shape == (seq_len, batch_size, embed_dim), "Attention output shape broken"

        except ImportError:
            assert True, "Attention mechanism not implemented yet"

    def test_scaled_dot_product_attention(self):
        """Test core attention computation."""
        try:
            from tinytorch.core.attention import scaled_dot_product_attention
            from tinytorch.core.tensor import Tensor

            # Attention inputs: queries, keys, values
            seq_len, embed_dim = 8, 16
            Q = Tensor(np.random.randn(seq_len, embed_dim))
            K = Tensor(np.random.randn(seq_len, embed_dim))
            V = Tensor(np.random.randn(seq_len, embed_dim))

            # Compute attention
            output, attention_weights = scaled_dot_product_attention(Q, K, V)

            assert output.shape == V.shape, "Attention output shape wrong"
            assert attention_weights.shape == (seq_len, seq_len), "Attention weights shape wrong"

            # Attention weights should sum to 1 across keys
            weight_sums = np.sum(attention_weights.data, axis=1)
            assert np.allclose(weight_sums, 1.0), "Attention weights don't sum to 1"

        except ImportError:
            assert True, "Scaled dot-product attention not implemented yet"

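# For reference, a minimal plain-numpy sketch of what scaled dot-product attention
# is expected to compute: softmax(Q @ K.T / sqrt(d)) @ V, with each row of the
# attention weights summing to 1 across keys. This helper is illustrative only --
# it is not the tinytorch.core.attention implementation, just the standard formula
# the tests above assume. It operates on 2D numpy arrays of shape (seq_len, embed_dim).
def _reference_scaled_dot_product_attention(Q, K, V):
    d = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d)                             # (seq_len, seq_len) similarities
    scores = scores - scores.max(axis=-1, keepdims=True)      # numerical stability for softmax
    weights = np.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)   # rows sum to 1 across keys
    return weights @ V, weights
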
class TestProgressiveStackIntegration:
    """Test that the complete stack (01→07) works together."""

    def test_neural_network_with_attention(self):
        """Test neural network enhanced with attention."""
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense
            from tinytorch.core.activations import ReLU
            from tinytorch.core.attention import MultiHeadAttention

            # Build network: dense → attention → dense
            encoder = Dense(64, 64)
            attention = MultiHeadAttention(embed_dim=64, num_heads=8)
            decoder = Dense(64, 10)
            relu = ReLU()

            # Sequence input
            seq_len, batch_size, input_dim = 12, 4, 64
            x = Tensor(np.random.randn(seq_len, batch_size, input_dim))

            # Forward pass through network with attention
            h = relu(encoder(x))        # Dense processing
            attn_out = attention(h)     # Attention mechanism
            output = decoder(attn_out)  # Final projection

            assert output.shape == (seq_len, batch_size, 10), "Network with attention broken"

        except ImportError:
            assert True, "Neural network with attention not ready yet"

    def test_transformer_block_capability(self):
        """Test building transformer-style blocks."""
        try:
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.layers import Dense
            from tinytorch.core.activations import ReLU
            from tinytorch.core.tensor import Tensor

            # Transformer block components
            attention = MultiHeadAttention(embed_dim=128, num_heads=8)
            ff1 = Dense(128, 512)
            ff2 = Dense(512, 128)
            relu = ReLU()

            # Input sequence
            seq_len, batch_size, embed_dim = 16, 2, 128
            x = Tensor(np.random.randn(seq_len, batch_size, embed_dim))

            # Transformer block: attention + feedforward
            attn_out = attention(x)
            ff_out = ff2(relu(ff1(attn_out)))

            # Residual connection (if implemented)
            if hasattr(x, '__add__'):
                output = x + ff_out  # Residual connection
            else:
                output = ff_out

            assert output.shape == x.shape, "Transformer block broken"

        except ImportError:
            assert True, "Transformer block capability not ready yet"

class TestSequenceUnderstandingCapability:
    """Test that attention enables sequence understanding."""

    def test_sequence_to_sequence_capability(self):
        """Test sequence-to-sequence processing."""
        try:
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.tensor import Tensor

            # Encoder-decoder style processing
            encoder_attention = MultiHeadAttention(embed_dim=64, num_heads=4)
            decoder_attention = MultiHeadAttention(embed_dim=64, num_heads=4)

            # Source and target sequences
            src_len, tgt_len, batch_size, embed_dim = 10, 8, 2, 64
            src = Tensor(np.random.randn(src_len, batch_size, embed_dim))
            tgt = Tensor(np.random.randn(tgt_len, batch_size, embed_dim))

            # Encode source sequence
            encoded = encoder_attention(src)

            # Decode target sequence (with potential cross-attention)
            if hasattr(decoder_attention, 'cross_attention'):
                decoded = decoder_attention(tgt, encoded)
            else:
                decoded = decoder_attention(tgt)

            assert encoded.shape == src.shape, "Sequence encoding broken"
            assert decoded.shape == tgt.shape, "Sequence decoding broken"

        except ImportError:
            assert True, "Sequence-to-sequence not ready yet"

    def test_attention_pattern_analysis(self):
        """Test that attention creates meaningful patterns."""
        try:
            from tinytorch.core.attention import scaled_dot_product_attention
            from tinytorch.core.tensor import Tensor

            # Create sequence with clear patterns
            seq_len, embed_dim = 6, 8

            # Pattern: first and last tokens should attend to each other
            pattern_input = np.zeros((seq_len, embed_dim))
            pattern_input[0, :] = 1.0   # First token
            pattern_input[-1, :] = 1.0  # Last token

            Q = Tensor(pattern_input)
            K = Tensor(pattern_input)
            V = Tensor(pattern_input)

            output, attention_weights = scaled_dot_product_attention(Q, K, V)

            # Check attention patterns make sense:
            # first token should attend strongly to last token, and vice versa
            first_to_last = attention_weights.data[0, -1]
            last_to_first = attention_weights.data[-1, 0]

            # These should be among the highest attention weights
            assert first_to_last > 0.1, "Attention pattern not detected"
            assert last_to_first > 0.1, "Attention pattern not detected"

        except ImportError:
            assert True, "Attention pattern analysis not ready yet"

class TestNLPReadiness:
    """Test readiness for NLP applications."""

    def test_language_modeling_architecture(self):
        """Test architecture suitable for language modeling."""
        try:
            from tinytorch.core.attention import MultiHeadAttention
            from tinytorch.core.layers import Dense
            from tinytorch.core.tensor import Tensor

            # Language model components
            vocab_size, embed_dim, seq_len = 1000, 256, 32

            # Embedding layer (simplified)
            embedding = Dense(vocab_size, embed_dim)

            # Attention layers
            attention1 = MultiHeadAttention(embed_dim=embed_dim, num_heads=8)
            attention2 = MultiHeadAttention(embed_dim=embed_dim, num_heads=8)

            # Output projection
            output_proj = Dense(embed_dim, vocab_size)

            # Token sequence (integer token ids)
            batch_size = 4
            tokens = Tensor(np.random.randint(0, vocab_size, (seq_len, batch_size)))

            # Simple embedding lookup (simplified)
            if hasattr(embedding, 'embedding_lookup'):
                x = embedding.embedding_lookup(tokens)
            else:
                # Simplified: random embeddings
                x = Tensor(np.random.randn(seq_len, batch_size, embed_dim))

            # Transformer layers
            h1 = attention1(x)
            h2 = attention2(h1)

            # Output logits
            logits = output_proj(h2)

            assert logits.shape == (seq_len, batch_size, vocab_size), "Language model architecture broken"

        except ImportError:
            assert True, "Language modeling architecture not ready yet"

class TestRegressionPrevention:
    """Ensure previous modules still work after Module 07 development."""

    def test_no_foundation_regression(self):
        """Verify foundation stack (01→05) unchanged."""
        # Environment should remain stable
        assert sys.version_info.major >= 3, "Foundation: Python detection broken"

        # Project structure should remain intact
        project_root = Path(__file__).parent.parent.parent
        assert project_root.exists(), "Foundation: Project structure broken"

    def test_no_spatial_regression(self):
        """Verify spatial operations (Module 06) unchanged."""
        try:
            from tinytorch.core.spatial import Conv2D

            # Spatial operations should still work
            conv = Conv2D(in_channels=1, out_channels=8, kernel_size=3)
            assert hasattr(conv, 'forward'), "Spatial regression: Conv2D broken"

        except ImportError:
            # If not implemented, that's fine,
            # but numpy should still work (from foundation)
            arr = np.array([1, 2, 3])
            assert arr.shape == (3,), "Spatial regression: Numpy foundation broken"

    def test_progressive_stability(self):
        """Test the progressive stack is stable through attention."""
        # Stack should be stable through:
        # Setup → Tensor → Activations → Layers → Dense → Spatial → Attention

        # Setup level
        assert np is not None, "Setup level broken"

        # Foundation level (if available)
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense

            # Should still be able to build neural networks
            layer = Dense(10, 5)
            x = Tensor(np.random.randn(4, 10))
            output = layer(x)
            assert output.shape == (4, 5), "Foundation level broken"

        except ImportError:
            pass  # Not implemented yet

        # Attention level (if available)
        try:
            from tinytorch.core.attention import MultiHeadAttention
            attention = MultiHeadAttention(embed_dim=32, num_heads=4)
            assert callable(attention), "Attention level broken"
        except ImportError:
            pass  # Not implemented yet