"""
Integration Tests - Attention Pipeline

Tests cross-module pipeline interfaces and compatibility.
Focuses on how attention integrates with other TinyTorch modules to build complete workflows.
"""

import pytest
import numpy as np
from test_utils import setup_integration_test

# Ensure proper setup before importing
setup_integration_test()

# Import ONLY from TinyTorch package
from tinytorch.core.tensor import Tensor
from tinytorch.core.attention import scaled_dot_product_attention, SelfAttention, create_causal_mask
from tinytorch.core.layers import Dense
from tinytorch.core.activations import ReLU, Softmax
from tinytorch.core.dense import Sequential


class TestAttentionDensePipelineInterface:
    """Test interface compatibility between Attention and Dense modules."""
    
    def test_attention_output_to_dense_input(self):
        """Test that attention output can be used as Dense layer input."""
        seq_len, d_model = 6, 16
        
        # Create attention and dense components
        self_attn = SelfAttention(d_model)
        dense = Dense(input_size=d_model, output_size=10)
        
        # Create input
        x = Tensor(np.random.randn(seq_len, d_model))
        
        # Test pipeline interface: Attention → Dense
        attn_output, _ = self_attn(x.data)
        
        # Test that attention output can feed into dense layer
        for i in range(seq_len):
            pos_input = Tensor(attn_output[i:i+1])  # Single position
            dense_output = dense(pos_input)
            
            # Verify interface compatibility
            assert isinstance(dense_output, Tensor), "Dense should accept attention output as Tensor"
            assert dense_output.shape == (1, 10), "Dense should process attention output correctly"
    
    def test_attention_sequential_compatibility(self):
        """Test that attention can be integrated into Sequential pipelines."""
        d_model = 8
        
        # Test if we can build: Tensor → Dense → Attention-style processing
        input_tensor = Tensor(np.random.randn(4, 6))
        
        # Step 1: Dense layer to project to d_model
        projection = Dense(input_size=6, output_size=d_model)
        projected = projection(input_tensor)
        
        # Step 2: Attention processing (simulating attention in pipeline)
        self_attn = SelfAttention(d_model)
        attn_output, _ = self_attn(projected.data)
        
        # Step 3: Back to Dense layer
        output_projection = Dense(input_size=d_model, output_size=3)
        final_outputs = []
        for i in range(4):
            pos_input = Tensor(attn_output[i:i+1])
            pos_output = output_projection(pos_input)
            final_outputs.append(pos_output.data)
        
        final_result = np.concatenate(final_outputs, axis=0)
        
        # Verify pipeline interface works
        assert final_result.shape == (4, 3), "Complete pipeline should work"
        assert not np.any(np.isnan(final_result)), "Pipeline should produce valid outputs"
    
    def test_attention_with_activation_integration(self):
        """Test attention integration with activation functions."""
        seq_len, d_model = 5, 12
        
        # Create components
        self_attn = SelfAttention(d_model)
        relu = ReLU()
        dense = Dense(input_size=d_model, output_size=d_model)
        
        # Test pipeline: Input → Attention → Activation → Dense
        x = Tensor(np.random.randn(seq_len, d_model))
        
        # Attention step
        attn_output, _ = self_attn(x.data)
        
        # Process each position through activation and dense
        for i in range(seq_len):
            # Attention → Tensor → Activation → Dense pipeline
            pos_tensor = Tensor(attn_output[i:i+1])
            activated = relu(pos_tensor)
            dense_output = dense(activated)
            
            # Verify cross-module interface
            assert isinstance(activated, Tensor), "Activation should work with attention output"
            assert isinstance(dense_output, Tensor), "Dense should work after activation"
            assert dense_output.shape == (1, d_model), "Pipeline should preserve expected shapes"


class TestAttentionMultiModuleWorkflows:
    """Test attention in multi-module workflows and architectures."""
    
    def test_encoder_decoder_interface_pattern(self):
        """Test encoder-decoder pattern using multiple TinyTorch modules."""
        src_len, tgt_len, d_model = 6, 4, 16
        
        # Source processing (encoder-style)
        src = Tensor(np.random.randn(src_len, d_model))
        src_projection = Dense(input_size=d_model, output_size=d_model)
        src_projected = src_projection(src)
        
        encoder_attn = SelfAttention(d_model)
        encoded, _ = encoder_attn(src_projected.data)
        
        # Target processing (decoder-style)
        tgt = Tensor(np.random.randn(tgt_len, d_model))
        tgt_projection = Dense(input_size=d_model, output_size=d_model)
        tgt_projected = tgt_projection(tgt)
        
        # Cross-attention interface test
        cross_output, _ = scaled_dot_product_attention(
            tgt_projected.data,  # Queries from target
            encoded,            # Keys from encoder
            encoded             # Values from encoder
        )
        
        # Final processing
        output_projection = Dense(input_size=d_model, output_size=10)
        final_outputs = []
        for i in range(tgt_len):
            pos_input = Tensor(cross_output[i:i+1])
            pos_output = output_projection(pos_input)
            final_outputs.append(pos_output.data)
        
        final_result = np.concatenate(final_outputs, axis=0)
        
        # Verify multi-module workflow
        assert final_result.shape == (tgt_len, 10), "Encoder-decoder workflow should work"
        assert not np.any(np.isnan(final_result)), "Multi-module workflow should be stable"
    
    def test_multi_layer_attention_with_residuals(self):
        """Test multi-layer attention with residual connections using multiple modules."""
        seq_len, d_model = 8, 20
        num_layers = 3
        
        # Initial processing
        x = Tensor(np.random.randn(seq_len, d_model))
        embedding_projection = Dense(input_size=d_model, output_size=d_model)
        current_repr = embedding_projection(x).data
        
        # Multi-layer processing with residuals
        for layer in range(num_layers):
            # Self-attention
            attn = SelfAttention(d_model)
            attn_output, _ = attn(current_repr)
            
            # Feedforward network (using Dense layers)
            ff_network = Sequential([
                Dense(input_size=d_model, output_size=d_model * 2),
                ReLU(),
                Dense(input_size=d_model * 2, output_size=d_model)
            ])
            
            # Process each position through feedforward
            ff_outputs = []
            for i in range(seq_len):
                pos_input = Tensor(attn_output[i:i+1])
                pos_output = ff_network(pos_input)
                ff_outputs.append(pos_output.data)
            
            ff_result = np.concatenate(ff_outputs, axis=0)
            
            # Residual connection (attention + feedforward)
            current_repr = attn_output + ff_result
        
        # Verify multi-layer integration
        assert current_repr.shape == (seq_len, d_model), "Multi-layer should preserve shape"
        assert not np.any(np.isnan(current_repr)), "Multi-layer integration should be stable"
    
    def test_attention_classification_pipeline(self):
        """Test attention in classification pipeline with multiple modules."""
        seq_len, d_model, num_classes = 10, 24, 5
        
        # Input processing
        sentence = Tensor(np.random.randn(seq_len, d_model))
        input_projection = Dense(input_size=d_model, output_size=d_model)
        projected_input = input_projection(sentence)
        
        # Attention processing
        self_attn = SelfAttention(d_model)
        attended_seq, _ = self_attn(projected_input.data)
        
        # Global pooling (sequence → single representation)
        pooled_repr = np.mean(attended_seq, axis=0, keepdims=True)
        
        # Classification head (using Sequential)
        classifier = Sequential([
            Dense(input_size=d_model, output_size=d_model // 2),
            ReLU(),
            Dense(input_size=d_model // 2, output_size=num_classes)
        ])
        
        # Final classification
        pooled_tensor = Tensor(pooled_repr)
        class_scores = classifier(pooled_tensor)
        
        # Verify classification pipeline
        assert class_scores.shape == (1, num_classes), "Classification pipeline should work"
        assert isinstance(class_scores, Tensor), "Pipeline should produce Tensor output"


class TestAttentionDataFlowCompatibility:
    """Test data flow compatibility between attention and other modules."""
    
    def test_shape_preservation_across_modules(self):
        """Test that shapes flow correctly between attention and other modules."""
        batch_configs = [
            (4, 8),    # Small sequence
            (16, 32),  # Medium sequence
            (8, 64),   # Large model dimension
        ]
        
        for seq_len, d_model in batch_configs:
            # Input
            x = Tensor(np.random.randn(seq_len, d_model))
            
            # Processing pipeline
            input_proj = Dense(input_size=d_model, output_size=d_model)
            projected = input_proj(x)
            
            attn = SelfAttention(d_model)
            attn_out, _ = attn(projected.data)
            
            output_proj = Dense(input_size=d_model, output_size=d_model // 2)
            
            # Test shape flow
            for i in range(seq_len):
                pos_tensor = Tensor(attn_out[i:i+1])
                final_out = output_proj(pos_tensor)
                
                # Verify shape compatibility
                assert final_out.shape == (1, d_model // 2), f"Shape flow failed for config {(seq_len, d_model)}"
    
    def test_dtype_preservation_across_modules(self):
        """Test that data types are preserved across attention and other modules."""
        seq_len, d_model = 6, 16
        
        # Test float32 flow
        x_f32 = Tensor(np.random.randn(seq_len, d_model).astype(np.float32))
        
        dense_f32 = Dense(input_size=d_model, output_size=d_model)
        projected_f32 = dense_f32(x_f32)
        
        attn_f32 = SelfAttention(d_model)
        attn_out_f32, _ = attn_f32(projected_f32.data)
        
        # Verify dtype flow
        assert projected_f32.dtype == np.float32, "Dense should preserve float32"
        assert attn_out_f32.dtype == np.float32, "Attention should preserve float32"
        
        # Test conversion back to Tensor
        result_tensor_f32 = Tensor(attn_out_f32)
        assert result_tensor_f32.dtype == np.float32, "Tensor creation should preserve float32"
    
    def test_error_handling_across_modules(self):
        """Test error handling when modules are incompatibly connected."""
        # Test dimension mismatch between attention and dense
        seq_len = 4
        attn_dim = 8
        dense_dim = 16  # Intentional mismatch
        
        x = Tensor(np.random.randn(seq_len, attn_dim))
        attn = SelfAttention(attn_dim)
        attn_out, _ = attn(x.data)
        
        # This should fail gracefully
        incompatible_dense = Dense(input_size=dense_dim, output_size=10)
        
        try:
            pos_tensor = Tensor(attn_out[0:1])  # Shape (1, 8)
            result = incompatible_dense(pos_tensor)  # Expects (1, 16)
            assert False, "Should have failed with dimension mismatch"
        except (ValueError, AssertionError, TypeError) as e:
            # Expected behavior - should fail with clear error
            assert isinstance(e, (ValueError, AssertionError, TypeError)), "Should fail gracefully with incompatible dimensions"


class TestAttentionSystemLevelIntegration:
    """Test system-level integration scenarios."""
    
    def test_complete_transformer_block_simulation(self):
        """Test simulation of complete transformer block using TinyTorch modules."""
        seq_len, d_model = 8, 32
        
        # Input
        x = Tensor(np.random.randn(seq_len, d_model))
        
        # Transformer block simulation
        # 1. Self-attention
        self_attn = SelfAttention(d_model)
        attn_out, _ = self_attn(x.data)
        
        # 2. Residual connection (attention + input)
        attn_residual = attn_out + x.data
        
        # 3. Feedforward network
        ff_net = Sequential([
            Dense(input_size=d_model, output_size=d_model * 4),
            ReLU(),
            Dense(input_size=d_model * 4, output_size=d_model)
        ])
        
        # Process each position through feedforward
        ff_outputs = []
        for i in range(seq_len):
            pos_input = Tensor(attn_residual[i:i+1])
            pos_output = ff_net(pos_input)
            ff_outputs.append(pos_output.data)
        
        ff_result = np.concatenate(ff_outputs, axis=0)
        
        # 4. Second residual connection
        final_output = attn_residual + ff_result
        
        # Verify complete transformer block simulation
        assert final_output.shape == (seq_len, d_model), "Transformer block should preserve shape"
        assert not np.any(np.isnan(final_output)), "Transformer block should be stable"
        
        # Test that output can be used for next layer
        next_attn = SelfAttention(d_model)
        next_out, _ = next_attn(final_output)
        assert next_out.shape == (seq_len, d_model), "Should be stackable"
    
    def test_modular_component_replacement(self):
        """Test that attention components can be replaced modularly."""
        seq_len, d_model = 6, 16
        
        x = Tensor(np.random.randn(seq_len, d_model))
        
        # Pipeline with different attention configurations
        attention_variants = [
            SelfAttention(d_model),
            SelfAttention(d_model),  # Different instance
            SelfAttention(d_model),  # Another instance
        ]
        
        dense_postprocess = Dense(input_size=d_model, output_size=8)
        
        # Test that all variants work in same pipeline
        for i, attn_variant in enumerate(attention_variants):
            attn_out, _ = attn_variant(x.data)
            
            # Process first position
            pos_tensor = Tensor(attn_out[0:1])
            result = dense_postprocess(pos_tensor)
            
            # Verify modular replacement works
            assert result.shape == (1, 8), f"Attention variant {i} should work in pipeline"
            assert isinstance(result, Tensor), f"Attention variant {i} should produce Tensor output"


if __name__ == "__main__":
    pytest.main([__file__])