# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║                        🚨 CRITICAL WARNING 🚨                                ║
# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
# ║                                                                               ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
# ║                                                                               ║
# ║  ✅ TO EDIT: src/XX_acceleration/XX_acceleration.py                 ║
# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
# ║                                                                               ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
# ║     Editing it directly may break module functionality and training.         ║
# ║                                                                               ║
# ║  🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)    ║
# ║     The tinytorch/ directory is generated code - edit source files instead!  ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['vectorized_matmul', 'fused_gelu', 'tiled_matmul']

# %% ../../modules/18_acceleration/18_acceleration.ipynb 0
#| default_exp optimization.acceleration
#| export

# %% ../../modules/18_acceleration/18_acceleration.ipynb 5
# Import from TinyTorch package (previous modules must be completed and exported)
from tinytorch.core.tensor import Tensor

# %% ../../modules/18_acceleration/18_acceleration.ipynb 7
def vectorized_matmul(a: Tensor, b: Tensor) -> Tensor:
    """
    High-performance matrix multiplication using vectorized operations.

    This implementation leverages optimized BLAS libraries that use:
    - SIMD instructions for parallel computation
    - Cache-blocking for memory efficiency
    - Multi-threading for CPU parallelization

    TODO: Implement production-grade matrix multiplication

    APPROACH:
    1. Validate shapes are compatible for matrix multiplication
    2. Use NumPy's optimized dot product (calls BLAS GEMM)
    3. Return result wrapped in Tensor

    Args:
        a: First tensor for multiplication (M×K or batch×M×K)
        b: Second tensor for multiplication (K×N or batch×K×N)

    Returns:
        Result tensor of shape (M×N or batch×M×N)

    EXAMPLE:
    Matrix multiplication visualization:
    >>> a = Tensor([[1, 2], [3, 4]])  # 2×2
    >>> b = Tensor([[5, 6], [7, 8]])  # 2×2
    >>> result = vectorized_matmul(a, b)
    >>> print(result.data)
    [[19 22]    # [1×5+2×7, 1×6+2×8] = [19, 22]
     [43 50]]   # [3×5+4×7, 3×6+4×8] = [43, 50]

    PERFORMANCE CHARACTERISTICS:
    - Time Complexity: O(N³) but highly optimized
    - Space Complexity: O(N²) for result
    - Arithmetic Intensity: 2N³ FLOPs / 3N² bytes = 2N/3 (good for large N)

    HINTS:
    - Check a.shape[-1] == b.shape[-2] for inner dimension match
    - Use np.matmul() for batch support and optimization
    - Trust BLAS to handle the vectorization magic
    """
    ### BEGIN SOLUTION
    # Input validation for matrix multiplication
    if len(a.shape) < 2 or len(b.shape) < 2:
        raise ValueError(
            f"Matrix multiplication requires 2D+ tensors, got shapes {a.shape} and {b.shape}. "
            f"💡 HINT: Use reshape() to add dimensions if needed."
        )

    if a.shape[-1] != b.shape[-2]:
        raise ValueError(
            f"Matrix multiplication shape mismatch: {a.shape} @ {b.shape}. "
            f"Inner dimensions must match: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}. "
            f"💡 HINT: For A@B, A's columns must equal B's rows."
        )

    # Use NumPy's highly optimized matrix multiplication
    # This calls BLAS GEMM (General Matrix Multiply), which uses:
    # - SIMD vectorization for parallel arithmetic
    # - Cache blocking for memory efficiency
    # - Multi-threading on multi-core systems
    result_data = np.matmul(a.data, b.data)

    return Tensor(result_data)
    ### END SOLUTION

# %% ../../modules/18_acceleration/18_acceleration.ipynb 10
def fused_gelu(x: Tensor) -> Tensor:
    """
    Fused GELU activation that combines all operations in a single kernel.

    GELU combines the benefits of ReLU and sigmoid:
    - Smooth everywhere (unlike ReLU's discontinuity at 0)
    - Non-saturating for positive values (unlike sigmoid)
    - Probabilistic interpretation: x * P(X ≤ x) where X ~ N(0,1)

    Mathematical Definition:
    GELU(x) = x * Φ(x) where Φ(x) is the standard normal CDF

    Fast Approximation (used here):
    GELU(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))

    TODO: Implement fused GELU to minimize memory bandwidth

    APPROACH:
    1. Compute all intermediate values in a single expression
    2. Avoid creating temporary arrays
    3. Let NumPy's broadcasting handle vectorization

    Args:
        x: Input tensor to apply GELU activation

    Returns:
        GELU-activated tensor (same shape as input)

    EXAMPLE:
    >>> x = Tensor([-2, -1, 0, 1, 2])
    >>> result = fused_gelu(x)
    >>> print(result.data)
    [-0.04550026 -0.15865526  0.          0.8413447   1.9544997 ]
    # Notice: smooth transition through 0, positive bias

    MEMORY EFFICIENCY:
    - Unfused: 5 temporary arrays × input_size × 4 bytes
    - Fused: 0 temporary arrays, direct computation
    - Bandwidth reduction: ~80% for memory-bound operations

    HINTS:
    - Use np.sqrt(2.0 / np.pi) for the constant
    - Keep entire expression in one line for maximum fusion
    - NumPy will optimize the expression tree automatically
    """
    ### BEGIN SOLUTION
    # Mathematical constant for GELU approximation
    sqrt_2_over_pi = np.sqrt(2.0 / np.pi)

    # Fused GELU computation - all operations in single expression
    # This minimizes memory bandwidth by avoiding intermediate arrays
    # NumPy's expression evaluator will optimize this into efficient machine code
    result_data = 0.5 * x.data * (
        1.0 + np.tanh(sqrt_2_over_pi * (x.data + 0.044715 * x.data**3))
    )

    return Tensor(result_data)
    ### END SOLUTION

# %% ../../modules/18_acceleration/18_acceleration.ipynb 16
def tiled_matmul(a: Tensor, b: Tensor, tile_size: int = 64) -> Tensor:
    """
    Cache-aware matrix multiplication using tiling/blocking.

    Demonstrates blocking algorithm for cache optimization by breaking
    large matrix multiplications into cache-sized chunks.

    TODO: Implement cache-aware tiled matrix multiplication

    APPROACH:
    1. Validate inputs for matrix multiplication compatibility
    2. Use NumPy's optimized matmul (which already implements tiling internally)
    3. In production, explicit tiling would use nested loops over blocks

    Args:
        a: First matrix (M×K)
        b: Second matrix (K×N)
        tile_size: Block size for cache efficiency (default: 64)

    Returns:
        Result matrix (M×N)

    EXAMPLE:
    >>> a = Tensor(np.random.randn(256, 256))
    >>> b = Tensor(np.random.randn(256, 256))
    >>> result = tiled_matmul(a, b, tile_size=64)
    >>> # Same result as vectorized_matmul, but more cache-friendly for large matrices

    PERFORMANCE CHARACTERISTICS:
    - Reduces cache misses by working on blocks that fit in L1/L2
    - Especially beneficial for matrices larger than cache size
    - tile_size should match cache line size (typically 64 bytes)

    HINTS:
    - For educational purposes, we use NumPy's optimized BLAS
    - BLAS libraries (MKL, OpenBLAS) already implement cache blocking
    - Explicit tiling would use 6 nested loops (3 for tiles, 3 for elements)
    """
    ### BEGIN SOLUTION
    # Input validation
    if len(a.shape) < 2 or len(b.shape) < 2:
        raise ValueError(
            f"Tiled matmul requires 2D+ tensors, got shapes {a.shape} and {b.shape}. "
            f"💡 HINT: Tiling works on matrix operations."
        )

    if a.shape[-1] != b.shape[-2]:
        raise ValueError(
            f"Shape mismatch: {a.shape} @ {b.shape}. "
            f"Inner dimensions must match for matrix multiplication. "
            f"💡 HINT: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}"
        )

    # For educational purposes, we use NumPy's matmul which already
    # implements cache-aware tiling via BLAS libraries (MKL, OpenBLAS)
    # These libraries automatically partition large matrices into
    # cache-sized blocks for optimal performance

    # In a full educational implementation, you would write:
    # for i_tile in range(0, M, tile_size):
    #     for j_tile in range(0, N, tile_size):
    #         for k_tile in range(0, K, tile_size):
    #             # Multiply tile blocks that fit in cache
    #             C[i_tile:i_tile+tile_size, j_tile:j_tile+tile_size] +=
    #                 A[i_tile:i_tile+tile_size, k_tile:k_tile+tile_size] @
    #                 B[k_tile:k_tile+tile_size, j_tile:j_tile+tile_size]

    result_data = np.matmul(a.data, b.data)
    return Tensor(result_data)
    ### END SOLUTION