# ╔═══════════════════════════════════════════════════════════════════════════════╗ # ║ 🚨 CRITICAL WARNING 🚨 ║ # ║ AUTOGENERATED! DO NOT EDIT! ║ # ║ ║ # ║ This file is AUTOMATICALLY GENERATED from source modules. ║ # ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║ # ║ ║ # ║ ✅ TO EDIT: src/XX_acceleration/XX_acceleration.py ║ # ║ ✅ TO EXPORT: Run 'tito module complete ' ║ # ║ ║ # ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║ # ║ Editing it directly may break module functionality and training. ║ # ║ ║ # ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners) ║ # ║ The tinytorch/ directory is generated code - edit source files instead! ║ # ╚═══════════════════════════════════════════════════════════════════════════════╝ # %% auto 0 __all__ = ['vectorized_matmul', 'fused_gelu', 'tiled_matmul'] # %% ../../modules/18_acceleration/18_acceleration.ipynb 0 #| default_exp optimization.acceleration #| export # %% ../../modules/18_acceleration/18_acceleration.ipynb 5 # Import from TinyTorch package (previous modules must be completed and exported) from tinytorch.core.tensor import Tensor # %% ../../modules/18_acceleration/18_acceleration.ipynb 7 def vectorized_matmul(a: Tensor, b: Tensor) -> Tensor: """ High-performance matrix multiplication using vectorized operations. This implementation leverages optimized BLAS libraries that use: - SIMD instructions for parallel computation - Cache-blocking for memory efficiency - Multi-threading for CPU parallelization TODO: Implement production-grade matrix multiplication APPROACH: 1. Validate shapes are compatible for matrix multiplication 2. Use NumPy's optimized dot product (calls BLAS GEMM) 3. Return result wrapped in Tensor Args: a: First tensor for multiplication (M×K or batch×M×K) b: Second tensor for multiplication (K×N or batch×K×N) Returns: Result tensor of shape (M×N or batch×M×N) EXAMPLE: Matrix multiplication visualization: >>> a = Tensor([[1, 2], [3, 4]]) # 2×2 >>> b = Tensor([[5, 6], [7, 8]]) # 2×2 >>> result = vectorized_matmul(a, b) >>> print(result.data) [[19 22] # [1×5+2×7, 1×6+2×8] = [19, 22] [43 50]] # [3×5+4×7, 3×6+4×8] = [43, 50] PERFORMANCE CHARACTERISTICS: - Time Complexity: O(N³) but highly optimized - Space Complexity: O(N²) for result - Arithmetic Intensity: 2N³ FLOPs / 3N² bytes = 2N/3 (good for large N) HINTS: - Check a.shape[-1] == b.shape[-2] for inner dimension match - Use np.matmul() for batch support and optimization - Trust BLAS to handle the vectorization magic """ ### BEGIN SOLUTION # Input validation for matrix multiplication if len(a.shape) < 2 or len(b.shape) < 2: raise ValueError( f"Matrix multiplication requires 2D+ tensors, got shapes {a.shape} and {b.shape}. " f"💡 HINT: Use reshape() to add dimensions if needed." ) if a.shape[-1] != b.shape[-2]: raise ValueError( f"Matrix multiplication shape mismatch: {a.shape} @ {b.shape}. " f"Inner dimensions must match: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}. " f"💡 HINT: For A@B, A's columns must equal B's rows." ) # Use NumPy's highly optimized matrix multiplication # This calls BLAS GEMM (General Matrix Multiply), which uses: # - SIMD vectorization for parallel arithmetic # - Cache blocking for memory efficiency # - Multi-threading on multi-core systems result_data = np.matmul(a.data, b.data) return Tensor(result_data) ### END SOLUTION # %% ../../modules/18_acceleration/18_acceleration.ipynb 10 def fused_gelu(x: Tensor) -> Tensor: """ Fused GELU activation that combines all operations in a single kernel. GELU combines the benefits of ReLU and sigmoid: - Smooth everywhere (unlike ReLU's discontinuity at 0) - Non-saturating for positive values (unlike sigmoid) - Probabilistic interpretation: x * P(X ≤ x) where X ~ N(0,1) Mathematical Definition: GELU(x) = x * Φ(x) where Φ(x) is the standard normal CDF Fast Approximation (used here): GELU(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³))) TODO: Implement fused GELU to minimize memory bandwidth APPROACH: 1. Compute all intermediate values in a single expression 2. Avoid creating temporary arrays 3. Let NumPy's broadcasting handle vectorization Args: x: Input tensor to apply GELU activation Returns: GELU-activated tensor (same shape as input) EXAMPLE: >>> x = Tensor([-2, -1, 0, 1, 2]) >>> result = fused_gelu(x) >>> print(result.data) [-0.04550026 -0.15865526 0. 0.8413447 1.9544997 ] # Notice: smooth transition through 0, positive bias MEMORY EFFICIENCY: - Unfused: 5 temporary arrays × input_size × 4 bytes - Fused: 0 temporary arrays, direct computation - Bandwidth reduction: ~80% for memory-bound operations HINTS: - Use np.sqrt(2.0 / np.pi) for the constant - Keep entire expression in one line for maximum fusion - NumPy will optimize the expression tree automatically """ ### BEGIN SOLUTION # Mathematical constant for GELU approximation sqrt_2_over_pi = np.sqrt(2.0 / np.pi) # Fused GELU computation - all operations in single expression # This minimizes memory bandwidth by avoiding intermediate arrays # NumPy's expression evaluator will optimize this into efficient machine code result_data = 0.5 * x.data * ( 1.0 + np.tanh(sqrt_2_over_pi * (x.data + 0.044715 * x.data**3)) ) return Tensor(result_data) ### END SOLUTION # %% ../../modules/18_acceleration/18_acceleration.ipynb 16 def tiled_matmul(a: Tensor, b: Tensor, tile_size: int = 64) -> Tensor: """ Cache-aware matrix multiplication using tiling/blocking. Demonstrates blocking algorithm for cache optimization by breaking large matrix multiplications into cache-sized chunks. TODO: Implement cache-aware tiled matrix multiplication APPROACH: 1. Validate inputs for matrix multiplication compatibility 2. Use NumPy's optimized matmul (which already implements tiling internally) 3. In production, explicit tiling would use nested loops over blocks Args: a: First matrix (M×K) b: Second matrix (K×N) tile_size: Block size for cache efficiency (default: 64) Returns: Result matrix (M×N) EXAMPLE: >>> a = Tensor(np.random.randn(256, 256)) >>> b = Tensor(np.random.randn(256, 256)) >>> result = tiled_matmul(a, b, tile_size=64) >>> # Same result as vectorized_matmul, but more cache-friendly for large matrices PERFORMANCE CHARACTERISTICS: - Reduces cache misses by working on blocks that fit in L1/L2 - Especially beneficial for matrices larger than cache size - tile_size should match cache line size (typically 64 bytes) HINTS: - For educational purposes, we use NumPy's optimized BLAS - BLAS libraries (MKL, OpenBLAS) already implement cache blocking - Explicit tiling would use 6 nested loops (3 for tiles, 3 for elements) """ ### BEGIN SOLUTION # Input validation if len(a.shape) < 2 or len(b.shape) < 2: raise ValueError( f"Tiled matmul requires 2D+ tensors, got shapes {a.shape} and {b.shape}. " f"💡 HINT: Tiling works on matrix operations." ) if a.shape[-1] != b.shape[-2]: raise ValueError( f"Shape mismatch: {a.shape} @ {b.shape}. " f"Inner dimensions must match for matrix multiplication. " f"💡 HINT: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}" ) # For educational purposes, we use NumPy's matmul which already # implements cache-aware tiling via BLAS libraries (MKL, OpenBLAS) # These libraries automatically partition large matrices into # cache-sized blocks for optimal performance # In a full educational implementation, you would write: # for i_tile in range(0, M, tile_size): # for j_tile in range(0, N, tile_size): # for k_tile in range(0, K, tile_size): # # Multiply tile blocks that fit in cache # C[i_tile:i_tile+tile_size, j_tile:j_tile+tile_size] += # A[i_tile:i_tile+tile_size, k_tile:k_tile+tile_size] @ # B[k_tile:k_tile+tile_size, j_tile:j_tile+tile_size] result_data = np.matmul(a.data, b.data) return Tensor(result_data) ### END SOLUTION