# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║                        🚨 CRITICAL WARNING 🚨                                ║
# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
# ║                                                                               ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
# ║                                                                               ║
# ║  ✅ TO EDIT: src/XX_compression/XX_compression.py                   ║
# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
# ║                                                                               ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
# ║     Editing it directly may break module functionality and training.         ║
# ║                                                                               ║
# ║  🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)    ║
# ║     The tinytorch/ directory is generated code - edit source files instead!  ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
# Names exported by a star-import of this module.
# NOTE: `magnitude_prune` and `structured_prune` are each defined twice in this
# file; the later definitions (thin wrappers around `CompressionComplete`) are
# the ones bound to these names once the module has finished importing.
__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'magnitude_prune', 'structured_prune', 'low_rank_approximate',
           'KnowledgeDistillation', 'CompressionComplete', 'measure_sparsity', 'compress_model']

# %% ../../modules/16_compression/16_compression.ipynb 1
import numpy as np
import copy
from typing import List, Dict, Any, Tuple, Optional
import time

# Import from TinyTorch package (previous modules must be completed and exported)
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.activations import ReLU

# Constants for memory calculations (both are exported through __all__)
BYTES_PER_FLOAT32 = 4  # Size of one float32 value, in bytes
MB_TO_BYTES = 1024 * 1024  # Bytes per megabyte as used here (2**20); multiply MB by this to get bytes

# %% ../../modules/16_compression/16_compression.ipynb 12
def magnitude_prune(model, sparsity=0.9):
    """
    Zero the smallest-magnitude weights using one global threshold.

    Every parameter with more than one dimension is pooled, the magnitude at
    the ``sparsity`` percentile of that pool becomes the threshold, and each
    pooled entry whose magnitude is strictly below the threshold is zeroed.
    One-dimensional parameters (biases) are neither pooled nor modified.

    NOTE: this file defines ``magnitude_prune`` a second time further down
    (a wrapper around ``CompressionComplete.magnitude_prune``); that later
    definition is the one the module-level name refers to after import.

    EXAMPLE:
    >>> model = SimpleModel(Linear(100, 50), Linear(50, 10))
    >>> magnitude_prune(model, sparsity=0.8)
    >>> # about 80% of the entries of the two weight matrices are now zero

    Args:
        model: Object exposing ``parameters()``; every parameter must provide
            ``.shape`` and a numpy array in ``.data``.
        sparsity: Fraction (0-1) of the pooled weight entries to zero.

    Returns:
        The same ``model`` object. Each pruned parameter's ``.data`` is
        rebound to a masked copy of its previous array.

    Raises:
        ValueError: From ``np.percentile`` when ``sparsity * 100`` falls
            outside ``[0, 100]``.
    """
    ### BEGIN SOLUTION
    # Only multi-dimensional parameters take part; 1-D ones are biases.
    weight_params = [param for param in model.parameters() if len(param.shape) > 1]
    if not weight_params:
        return model

    # Pool all magnitudes in one vectorised step instead of extending a
    # Python list one scalar at a time.
    magnitudes = np.concatenate([np.abs(param.data).ravel() for param in weight_params])
    if magnitudes.size == 0:
        return model

    threshold = np.percentile(magnitudes, sparsity * 100)

    # Entries equal to the threshold survive (>=), so ties are kept.
    for param in weight_params:
        param.data = param.data * (np.abs(param.data) >= threshold)

    return model
    ### END SOLUTION

# %% ../../modules/16_compression/16_compression.ipynb 15
def structured_prune(model, prune_ratio=0.5):
    """
    Zero whole weight columns of every ``Linear`` layer, lowest L2 norm first.

    For each ``Linear`` in ``model.layers`` the L2 norm of every column of
    ``layer.weight.data`` is computed, the ``int(num_columns * prune_ratio)``
    columns with the smallest norms are set to zero in place, and the matching
    entries of ``layer.bias.data`` are zeroed when the layer has a bias.
    Layers that are not ``Linear`` are skipped.

    NOTE: this file defines ``structured_prune`` a second time further down
    (a wrapper around ``CompressionComplete.structured_prune``); that later
    definition is the one the module-level name refers to after import.

    EXAMPLE:
    >>> layer1 = Linear(100, 50)
    >>> model = SimpleModel(layer1, Linear(50, 10))
    >>> structured_prune(model, prune_ratio=0.3)
    >>> # 15 of the 50 columns of layer1.weight.data are now entirely zero

    Args:
        model: Object exposing ``layers``; ``Linear`` entries need a 2-D
            ``weight.data`` array and a ``bias`` attribute (may be ``None``).
            Columns are presumably the output channels - confirm against
            ``Linear``'s weight layout.
        prune_ratio: Fraction (0-1) of columns to zero in each layer.

    Returns:
        The same ``model`` object, modified in place.

    Raises:
        ValueError: From ``np.argpartition`` when more columns are requested
            than the layer has.
    """
    ### BEGIN SOLUTION
    for layer in model.layers:
        if not isinstance(layer, Linear):
            continue

        weight = layer.weight.data
        num_channels = weight.shape[1]
        num_to_prune = int(num_channels * prune_ratio)
        if num_to_prune <= 0:
            continue

        # One L2 norm per column.
        channel_norms = np.linalg.norm(weight, axis=0)

        # Partition around kth = num_to_prune - 1 so the first `num_to_prune`
        # positions hold the smallest norms. Using kth = num_to_prune would be
        # out of bounds when every column is selected (e.g. prune_ratio=1.0).
        prune_indices = np.argpartition(channel_norms, num_to_prune - 1)[:num_to_prune]

        # Zero the selected columns and their bias entries.
        weight[:, prune_indices] = 0
        if layer.bias is not None:
            layer.bias.data[prune_indices] = 0

    return model
    ### END SOLUTION

# %% ../../modules/16_compression/16_compression.ipynb 18
def low_rank_approximate(weight_matrix, rank_ratio=0.5):
    """
    Return a truncated SVD factorisation of a 2-D matrix.

    The matrix is decomposed with ``np.linalg.svd(..., full_matrices=False)``
    and only the leading ``k`` components are kept, where
    ``k = max(1, int(rank_ratio * min(rows, cols)))``.

    EXAMPLE:
    >>> weight = np.random.randn(100, 50)
    >>> U, S, V = low_rank_approximate(weight, rank_ratio=0.3)
    >>> U.shape, S.shape, V.shape
    ((100, 15), (15,), (15, 50))
    >>> approx = U @ np.diag(S) @ V   # rank-15 approximation of `weight`

    Args:
        weight_matrix: 2-D numpy array to factorise.
        rank_ratio: Fraction of the maximum possible rank to keep; at least
            one component is always retained.

    Returns:
        Tuple ``(U_k, S_k, Vt_k)``: the first ``k`` left singular vectors
        (as columns), the first ``k`` singular values, and the first ``k``
        rows of the transposed right singular vectors as returned by numpy.
    """
    ### BEGIN SOLUTION
    rows, cols = weight_matrix.shape

    # Number of singular components to retain (never fewer than one).
    keep = max(1, int(rank_ratio * min(rows, cols)))

    left, singular_values, right_t = np.linalg.svd(weight_matrix, full_matrices=False)

    return left[:, :keep], singular_values[:keep], right_t[:keep, :]
    ### END SOLUTION

# %% ../../modules/16_compression/16_compression.ipynb 21
class KnowledgeDistillation:
    """
    Knowledge distillation for model compression.

    Holds a teacher/student model pair together with the two hyper-parameters
    of the distillation loss, and computes that loss with numpy from already
    evaluated logits. Neither model is called by this class.
    """

    def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
        """
        Store the models and the distillation hyper-parameters.

        EXAMPLE:
        >>> teacher = SimpleModel(Linear(100, 200), Linear(200, 50))
        >>> student = SimpleModel(Linear(100, 50))
        >>> kd = KnowledgeDistillation(teacher, student, temperature=4.0, alpha=0.8)
        >>> print(f"Temperature: {kd.temperature}, Alpha: {kd.alpha}")
        Temperature: 4.0, Alpha: 0.8

        Args:
            teacher_model: Large, pre-trained model (stored, not used here).
            student_model: Smaller model to train (stored, not used here).
            temperature: Divisor applied to the logits before the softmax
                used for the soft-target term.
            alpha: Weight of the soft-target term; the hard-target term is
                weighted by ``1 - alpha``.
        """
        ### BEGIN SOLUTION
        self.teacher_model = teacher_model
        self.student_model = student_model
        self.temperature = temperature
        self.alpha = alpha
        ### END SOLUTION

    def distillation_loss(self, student_logits, teacher_logits, true_labels):
        """
        Compute ``alpha * soft_loss + (1 - alpha) * hard_loss``.

        * ``soft_loss`` is KL(teacher || student) between the
          temperature-softened softmax distributions, averaged over samples.
        * ``hard_loss`` is the mean cross-entropy between the student's
          un-softened softmax and ``true_labels``.

        Both terms are per-sample averages, so the ``alpha`` balance does not
        depend on the batch size. The conventional ``temperature ** 2``
        rescaling of the soft term is not applied.

        EXAMPLE:
        >>> kd = KnowledgeDistillation(teacher, student)
        >>> loss = kd.distillation_loss(student_out, teacher_out, labels)
        >>> print(f"Distillation loss: {loss:.4f}")

        Args:
            student_logits: Student outputs, class scores along the last axis.
                Either a numpy array or an object whose ``.data`` is one.
            teacher_logits: Teacher outputs in the same layout.
            true_labels: 1-D class indices or 2-D one-hot/probability rows;
                numpy array, array-like, or an object exposing ``.data``.

        Returns:
            The combined loss as a numpy floating-point scalar.
        """
        ### BEGIN SOLUTION
        student_logits = self._to_numpy(student_logits)
        teacher_logits = self._to_numpy(teacher_logits)
        true_labels = self._to_numpy(true_labels)

        # Soften both distributions with the same temperature.
        student_soft = self._softmax(student_logits / self.temperature)
        teacher_soft = self._softmax(teacher_logits / self.temperature)

        # The teacher is the target distribution, so it is the first
        # argument: KL(teacher || student).
        soft_loss = self._kl_divergence(teacher_soft, student_soft)

        # Hard-target term uses the un-softened student distribution.
        student_hard = self._softmax(student_logits)
        hard_loss = self._cross_entropy(student_hard, true_labels)

        return self.alpha * soft_loss + (1 - self.alpha) * hard_loss
        ### END SOLUTION

    @staticmethod
    def _to_numpy(value):
        """Return ``value`` as a numpy array, unwrapping a ``.data`` payload if present."""
        # ndarrays are returned untouched: their own `.data` attribute is a
        # raw memoryview, not the array contents we want.
        if isinstance(value, np.ndarray):
            return value
        return np.asarray(getattr(value, "data", value))

    def _softmax(self, logits):
        """Compute a softmax over the last axis, shifted by the row max for stability."""
        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

    def _kl_divergence(self, p, q):
        """
        Compute KL(p || q) over the last axis, averaged over all other axes.

        Probabilities are clipped to at least 1e-8 inside the logarithms only,
        so identical inputs give exactly 0 and zero entries do not produce
        ``-inf`` or ``nan``.
        """
        eps = 1e-8
        log_ratio = np.log(np.clip(p, eps, None)) - np.log(np.clip(q, eps, None))
        return np.mean(np.sum(p * log_ratio, axis=-1))

    def _cross_entropy(self, predictions, labels):
        """
        Compute the mean cross-entropy of ``predictions`` against ``labels``.

        ``labels`` with one dimension are treated as class indices (cast to
        integers so float-typed index arrays can be used); otherwise they are
        treated as per-class target rows matching ``predictions``.
        """
        labels = np.asarray(labels)
        if labels.ndim == 1:
            class_ids = labels.astype(np.intp)
            picked = predictions[np.arange(len(class_ids)), class_ids]
            return -np.mean(np.log(picked + 1e-8))
        return -np.mean(np.sum(labels * np.log(predictions + 1e-8), axis=1))

# %% ../../modules/16_compression/16_compression.ipynb 37
class CompressionComplete:
    """
    Namespace of static compression helpers for milestone use.

    Groups sparsity measurement, magnitude pruning, structured pruning and a
    small pipeline that chains the two pruning steps. Every method iterates
    ``model.layers``.
    """

    @staticmethod
    def measure_sparsity(model) -> float:
        """
        Return the fraction (0-1) of parameter entries that are exactly zero.

        All parameters returned by each layer's ``parameters()`` are counted.
        A model without any parameter entries yields ``0.0``.
        """
        params = [param for layer in model.layers for param in layer.parameters()]

        total_count = 0
        zero_count = 0
        for param in params:
            total_count += param.size
            zero_count += np.sum(param.data == 0)

        if total_count > 0:
            return zero_count / total_count
        return 0.0

    @staticmethod
    def magnitude_prune(model, sparsity=0.5):
        """
        Zero the smallest-magnitude entries of every parameter, in place.

        Each parameter gets its own threshold: the ``sparsity`` percentile of
        its absolute values. Entries strictly below that threshold are zeroed.

        Args:
            model: SimpleModel with .layers attribute
            sparsity: Fraction of weights to prune (0-1)

        Returns:
            The same ``model`` object.
        """
        for layer in model.layers:
            for param in layer.parameters():
                values = param.data
                cutoff = np.percentile(np.abs(values), sparsity * 100)
                below_cutoff = np.abs(values) < cutoff
                values[below_cutoff] = 0

        return model

    @staticmethod
    def structured_prune(model, prune_ratio=0.5):
        """
        Zero low-norm weight columns of every ``Linear`` layer, in place.

        For each ``Linear`` layer with a 2-D weight, the L2 norm of every
        column is compared with the ``prune_ratio`` percentile of those norms;
        columns whose norm is not at least that value are zeroed. Biases are
        left unchanged.

        Args:
            model: SimpleModel with .layers attribute
            prune_ratio: Fraction of structures to prune (0-1)

        Returns:
            The same ``model`` object.
        """
        for layer in model.layers:
            if not isinstance(layer, Linear):
                continue

            weight = layer.weight
            if len(weight.shape) != 2:
                continue

            column_norms = np.linalg.norm(weight.data, axis=0)
            cutoff = np.percentile(column_norms, prune_ratio * 100)
            keep = column_norms >= cutoff
            weight.data[:, np.logical_not(keep)] = 0

        return model

    @staticmethod
    def compress_model(model, compression_config: Dict[str, Any]):
        """
        Run the configured pruning steps and report sparsity statistics.

        Magnitude pruning runs first, then structured pruning; a step runs
        only when its key is present in ``compression_config``.

        Args:
            model: Model to compress
            compression_config: Dictionary with compression settings
                - 'magnitude_sparsity': float (0-1)
                - 'structured_prune_ratio': float (0-1)

        Returns:
            Tuple ``(model, stats)`` where ``stats`` holds
            'original_sparsity', 'final_sparsity' and 'compression_ratio'
            (``1 / (1 - final_sparsity)``, or infinity when the final
            sparsity is not below 1).
        """
        stats = {'original_sparsity': CompressionComplete.measure_sparsity(model)}

        # (config key, pruning step) pairs, applied in this order.
        pipeline = (
            ('magnitude_sparsity', CompressionComplete.magnitude_prune),
            ('structured_prune_ratio', CompressionComplete.structured_prune),
        )
        for key, prune_step in pipeline:
            if key in compression_config:
                model = prune_step(model, compression_config[key])

        final_sparsity = CompressionComplete.measure_sparsity(model)
        stats['final_sparsity'] = final_sparsity
        if final_sparsity < 1.0:
            stats['compression_ratio'] = 1.0 / (1.0 - final_sparsity)
        else:
            stats['compression_ratio'] = float('inf')

        return model, stats

# Convenience functions for backward compatibility
def measure_sparsity(model) -> float:
    """
    Measure model sparsity.

    Module-level convenience wrapper: forwards ``model`` to
    ``CompressionComplete.measure_sparsity`` and returns its result
    unchanged.
    """
    return CompressionComplete.measure_sparsity(model)

def magnitude_prune(model, sparsity=0.5):
    """
    Apply magnitude-based pruning.

    Module-level convenience wrapper: forwards both arguments to
    ``CompressionComplete.magnitude_prune`` and returns its result unchanged.

    NOTE: this rebinds the name ``magnitude_prune`` that is also defined
    earlier in this file (that earlier function defaults to ``sparsity=0.9``);
    after import, this wrapper is what the module-level name refers to.
    """
    return CompressionComplete.magnitude_prune(model, sparsity)

def structured_prune(model, prune_ratio=0.5):
    """
    Apply structured pruning.

    Module-level convenience wrapper: forwards both arguments to
    ``CompressionComplete.structured_prune`` and returns its result unchanged.

    NOTE: this rebinds the name ``structured_prune`` that is also defined
    earlier in this file; after import, this wrapper is what the module-level
    name refers to.
    """
    return CompressionComplete.structured_prune(model, prune_ratio)

def compress_model(model, compression_config: Dict[str, Any]):
    """
    Apply complete compression pipeline.

    Module-level convenience wrapper: forwards both arguments to
    ``CompressionComplete.compress_model`` and returns its result unchanged.
    """
    return CompressionComplete.compress_model(model, compression_config)