mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-10 02:12:00 -05:00
227 lines
9.4 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||
# ║ ║
|
||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||
# ║ ║
|
||
# ║ ✅ TO EDIT: src/XX_acceleration/XX_acceleration.py ║
|
||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||
# ║ ║
|
||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||
# ║ Editing it directly may break module functionality and training. ║
|
||
# ║ ║
|
||
# ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners) ║
|
||
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
|
||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||
# %% auto 0
|
||
# Public names exported when this generated module is star-imported.
__all__ = ['vectorized_matmul', 'fused_gelu', 'tiled_matmul']
|
||
|
||
# %% ../../modules/18_acceleration/18_acceleration.ipynb 0
|
||
#| default_exp optimization.acceleration
|
||
#| export
|
||
|
||
# %% ../../modules/18_acceleration/18_acceleration.ipynb 5
|
||
# Import from TinyTorch package (previous modules must be completed and exported)
|
||
import numpy as np

from tinytorch.core.tensor import Tensor
|
||
|
||
# %% ../../modules/18_acceleration/18_acceleration.ipynb 7
|
||
def vectorized_matmul(a: Tensor, b: Tensor) -> Tensor:
    """
    Multiply two tensors with NumPy's BLAS-backed ``matmul``.

    After validating shapes, this delegates to ``np.matmul`` so the heavy
    lifting is done by the platform BLAS GEMM (SIMD vectorization, cache
    blocking, and multi-threading come for free).

    Args:
        a: Left operand, shape (..., M, K) — at least 2-D.
        b: Right operand, shape (..., K, N) — at least 2-D.

    Returns:
        Tensor holding the product, shape (..., M, N).

    Raises:
        ValueError: If either operand has fewer than two dimensions, or
            the inner dimensions (a.shape[-1] vs b.shape[-2]) disagree.

    Example:
        >>> a = Tensor([[1, 2], [3, 4]])
        >>> b = Tensor([[5, 6], [7, 8]])
        >>> vectorized_matmul(a, b).data
        array([[19, 22],
               [43, 50]])
    """
    ### BEGIN SOLUTION
    rank_a, rank_b = len(a.shape), len(b.shape)

    # Guard: matmul is only defined for matrices (or batches of them).
    if rank_a < 2 or rank_b < 2:
        raise ValueError(
            f"Matrix multiplication requires 2D+ tensors, got shapes {a.shape} and {b.shape}. "
            f"💡 HINT: Use reshape() to add dimensions if needed."
        )

    # Guard: A's columns must line up with B's rows.
    if a.shape[-1] != b.shape[-2]:
        raise ValueError(
            f"Matrix multiplication shape mismatch: {a.shape} @ {b.shape}. "
            f"Inner dimensions must match: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}. "
            f"💡 HINT: For A@B, A's columns must equal B's rows."
        )

    # np.matmul dispatches to BLAS GEMM — vectorized, cache-blocked,
    # and multi-threaded on most installations.
    product = np.matmul(a.data, b.data)
    return Tensor(product)
    ### END SOLUTION
|
||
|
||
# %% ../../modules/18_acceleration/18_acceleration.ipynb 10
|
||
def fused_gelu(x: Tensor) -> Tensor:
    """
    Apply the tanh-approximated GELU activation element-wise.

    Uses the fast approximation
        GELU(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
    of the exact definition GELU(x) = x * Φ(x), where Φ is the standard
    normal CDF. The whole computation is kept in vectorized NumPy
    expressions so no Python-level loop touches individual elements.

    Args:
        x: Input tensor of any shape.

    Returns:
        Tensor of the same shape with GELU applied element-wise.

    Example:
        >>> x = Tensor([-2, -1, 0, 1, 2])
        >>> fused_gelu(x).data
        array([-0.04550026, -0.15865526,  0.        ,  0.8413447 ,  1.9544997 ])
    """
    ### BEGIN SOLUTION
    # √(2/π) — the constant from the tanh approximation of the normal CDF.
    coeff = np.sqrt(2.0 / np.pi)

    # Same arithmetic as the one-line form, with the tanh argument named
    # for readability; operation order is preserved bit-for-bit.
    cubic_term = x.data + 0.044715 * x.data**3
    activated = 0.5 * x.data * (1.0 + np.tanh(coeff * cubic_term))

    return Tensor(activated)
    ### END SOLUTION
|
||
|
||
# %% ../../modules/18_acceleration/18_acceleration.ipynb 16
|
||
def tiled_matmul(a: Tensor, b: Tensor, tile_size: int = 64) -> Tensor:
    """
    Cache-aware matrix multiplication (tiling demonstration).

    Produces exactly the same result as ``vectorized_matmul``. The
    ``tile_size`` parameter documents the blocking idea; the actual
    computation is delegated to ``np.matmul``, whose underlying BLAS
    libraries (MKL, OpenBLAS) already partition large matrices into
    cache-sized blocks internally.

    Args:
        a: First matrix, shape (..., M, K) — at least 2-D.
        b: Second matrix, shape (..., K, N) — at least 2-D.
        tile_size: Nominal block size for cache efficiency (default 64).
            Not used by this delegating implementation.

    Returns:
        Result tensor of shape (..., M, N).

    Raises:
        ValueError: If either operand is under 2-D or the inner
            dimensions do not match.

    A hand-rolled version would use six nested loops — three over tile
    indices and three over elements within a tile:

        for i0 in range(0, M, tile_size):
            for j0 in range(0, N, tile_size):
                for k0 in range(0, K, tile_size):
                    C[i0:i0+tile_size, j0:j0+tile_size] += (
                        A[i0:i0+tile_size, k0:k0+tile_size]
                        @ B[k0:k0+tile_size, j0:j0+tile_size])
    """
    ### BEGIN SOLUTION
    # Guard: tiling (and matmul generally) needs matrix-shaped inputs.
    if len(a.shape) < 2 or len(b.shape) < 2:
        raise ValueError(
            f"Tiled matmul requires 2D+ tensors, got shapes {a.shape} and {b.shape}. "
            f"💡 HINT: Tiling works on matrix operations."
        )

    # Guard: inner dimensions must agree for A @ B.
    if a.shape[-1] != b.shape[-2]:
        raise ValueError(
            f"Shape mismatch: {a.shape} @ {b.shape}. "
            f"Inner dimensions must match for matrix multiplication. "
            f"💡 HINT: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}"
        )

    # Delegate to BLAS, which performs cache blocking automatically.
    return Tensor(np.matmul(a.data, b.data))
    ### END SOLUTION
|