mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-01 10:40:56 -05:00
feat: Complete standardized testing implementation across all modules
- Added standardized testing sections to modules 07_autograd and 08_optimizers
- Updated module.yaml files to reference inline testing approach
- Reorganized kernels module structure with proper testing placement
- All 12 TinyTorch modules now have consistent testing framework
- Fixed kernels module structure to match optimizers/training pattern
This commit is contained in:
@@ -183,9 +183,9 @@ Every major operation in deep learning uses matrix multiplication:
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "matmul-naive", "locked": false, "schema_version": 3, "solution": true, "task": false}
|
||||
#| export
|
||||
def matmul_naive(A: np.ndarray, B: np.ndarray) -> np.ndarray:
|
||||
def matmul(A: np.ndarray, B: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Naive matrix multiplication using explicit for-loops.
|
||||
Matrix multiplication using explicit for-loops.
|
||||
|
||||
This helps you understand what matrix multiplication really does!
|
||||
|
||||
@@ -224,8 +224,7 @@ def matmul_naive(A: np.ndarray, B: np.ndarray) -> np.ndarray:
|
||||
LEARNING CONNECTIONS:
|
||||
- This is what every neural network layer does internally
|
||||
- Understanding this helps debug shape mismatches
|
||||
- Forms the basis for efficient GPU computations
|
||||
- Essential for implementing custom layers
|
||||
- Essential for understanding the foundation of neural networks
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Get matrix dimensions
|
||||
@@ -252,7 +251,7 @@ def matmul_naive(A: np.ndarray, B: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
### 🧪 Test Your Matrix Multiplication
|
||||
|
||||
Once you implement the `matmul_naive` function above, run this cell to test it:
|
||||
Once you implement the `matmul` function above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-matmul-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
@@ -264,7 +263,7 @@ def test_matrix_multiplication():
|
||||
A = np.array([[1, 2], [3, 4]], dtype=np.float32)
|
||||
B = np.array([[5, 6], [7, 8]], dtype=np.float32)
|
||||
|
||||
result = matmul_naive(A, B)
|
||||
result = matmul(A, B)
|
||||
expected = np.array([[19, 22], [43, 50]], dtype=np.float32)
|
||||
|
||||
assert np.allclose(result, expected), f"Matrix multiplication failed: expected {expected}, got {result}"
|
||||
@@ -276,7 +275,7 @@ def test_matrix_multiplication():
|
||||
# Test different shapes
|
||||
A2 = np.array([[1, 2, 3]], dtype=np.float32) # 1x3
|
||||
B2 = np.array([[4], [5], [6]], dtype=np.float32) # 3x1
|
||||
result2 = matmul_naive(A2, B2)
|
||||
result2 = matmul(A2, B2)
|
||||
expected2 = np.array([[32]], dtype=np.float32) # 1*4 + 2*5 + 3*6 = 32
|
||||
|
||||
assert np.allclose(result2, expected2), f"1x3 @ 3x1 failed: expected {expected2}, got {result2}"
|
||||
@@ -284,7 +283,7 @@ def test_matrix_multiplication():
|
||||
# Test 3x3 case
|
||||
A3 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
|
||||
B3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32) # Identity
|
||||
result3 = matmul_naive(A3, B3)
|
||||
result3 = matmul(A3, B3)
|
||||
|
||||
assert np.allclose(result3, A3), "Multiplication by identity should preserve matrix"
|
||||
|
||||
@@ -293,7 +292,7 @@ def test_matrix_multiplication():
|
||||
B4 = np.array([[3], [4], [5]], dtype=np.float32) # 3x1
|
||||
|
||||
try:
|
||||
matmul_naive(A4, B4)
|
||||
matmul(A4, B4)
|
||||
assert False, "Should raise error for incompatible shapes"
|
||||
except ValueError as e:
|
||||
assert "Incompatible matrix dimensions" in str(e)
|
||||
@@ -364,14 +363,14 @@ class Dense:
|
||||
This is the fundamental building block of neural networks.
|
||||
"""
|
||||
|
||||
def __init__(self, input_size: int, output_size: int, use_bias: bool = True, use_naive_matmul: bool = False):
|
||||
def __init__(self, input_size: int, output_size: int, use_bias: bool = True):
|
||||
"""
|
||||
Initialize Dense layer with random weights and optional bias.
|
||||
|
||||
TODO: Implement Dense layer initialization.
|
||||
|
||||
STEP-BY-STEP IMPLEMENTATION:
|
||||
1. Store the layer parameters (input_size, output_size, use_bias, use_naive_matmul)
|
||||
1. Store the layer parameters (input_size, output_size, use_bias)
|
||||
2. Initialize weights with random values using proper scaling
|
||||
3. Initialize bias (if use_bias=True) with zeros
|
||||
4. Convert weights and bias to Tensor objects
|
||||
@@ -388,26 +387,19 @@ class Dense:
|
||||
```
|
||||
|
||||
IMPLEMENTATION HINTS:
|
||||
- Store parameters: self.input_size, self.output_size, self.use_bias, self.use_naive_matmul
|
||||
- Store parameters: self.input_size, self.output_size, self.use_bias
|
||||
- Weight shape: (input_size, output_size)
|
||||
- Bias shape: (output_size,) if use_bias else None
|
||||
- Use Xavier initialization: scale = np.sqrt(2.0 / (input_size + output_size))
|
||||
- Initialize weights: np.random.randn(input_size, output_size) * scale
|
||||
- Initialize bias: np.zeros(output_size) if use_bias else None
|
||||
- Convert to Tensors: self.weights = Tensor(weight_data), self.bias = Tensor(bias_data)
|
||||
|
||||
LEARNING CONNECTIONS:
|
||||
- This is like torch.nn.Linear() in PyTorch
|
||||
- Proper initialization prevents vanishing/exploding gradients
|
||||
- Bias adds flexibility to the linear transformation
|
||||
- Weight sharing across the layer enables parameter efficiency
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Store layer parameters
|
||||
self.input_size = input_size
|
||||
self.output_size = output_size
|
||||
self.use_bias = use_bias
|
||||
self.use_naive_matmul = use_naive_matmul
|
||||
|
||||
# Xavier/Glorot initialization
|
||||
scale = np.sqrt(2.0 / (input_size + output_size))
|
||||
@@ -435,10 +427,6 @@ class Dense:
|
||||
2. Add bias if present: result + self.bias
|
||||
3. Return the result as a Tensor
|
||||
|
||||
MATRIX MULTIPLICATION OPTIONS:
|
||||
- If use_naive_matmul=True: Use our custom matmul_naive function
|
||||
- If use_naive_matmul=False: Use NumPy's built-in @ operator
|
||||
|
||||
EXAMPLE USAGE:
|
||||
```python
|
||||
layer = Dense(input_size=3, output_size=2)
|
||||
@@ -447,8 +435,7 @@ class Dense:
|
||||
```
|
||||
|
||||
IMPLEMENTATION HINTS:
|
||||
- Matrix multiplication: x.data @ self.weights.data (or use matmul_naive)
|
||||
- For naive implementation: matmul_naive(x.data, self.weights.data)
|
||||
- Matrix multiplication: matmul(x.data, self.weights.data)
|
||||
- Add bias: result + self.bias.data (broadcasting handles shape)
|
||||
- Return as Tensor: return Tensor(final_result)
|
||||
- Handle both cases: with and without bias
|
||||
@@ -461,10 +448,7 @@ class Dense:
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Perform matrix multiplication
|
||||
if self.use_naive_matmul:
|
||||
linear_output = matmul_naive(x.data, self.weights.data)
|
||||
else:
|
||||
linear_output = x.data @ self.weights.data
|
||||
linear_output = matmul(x.data, self.weights.data)
|
||||
|
||||
# Add bias if present
|
||||
if self.use_bias and self.bias is not None:
|
||||
@@ -517,11 +501,6 @@ def test_dense_layer():
|
||||
no_bias_output = no_bias_layer(input_data)
|
||||
assert no_bias_output.shape == (1, 2), "No-bias layer should still produce correct shape"
|
||||
|
||||
# Test with naive matrix multiplication
|
||||
naive_layer = Dense(input_size=3, output_size=2, use_naive_matmul=True)
|
||||
naive_output = naive_layer(input_data)
|
||||
assert naive_output.shape == (1, 2), "Naive matmul should produce correct shape"
|
||||
|
||||
# Test that different inputs produce different outputs
|
||||
input1 = Tensor([[1, 0, 0]])
|
||||
input2 = Tensor([[0, 1, 0]])
|
||||
|
||||
@@ -23,4 +23,4 @@ files:
|
||||
components:
|
||||
- "Dense"
|
||||
- "Linear"
|
||||
- "matmul_naive"
|
||||
- "matmul"
|
||||
@@ -37,7 +37,7 @@ This module follows the **"Build → Use → Optimize"** pedagogical framework:
|
||||
|
||||
### 1. **Build**: Custom Operations
|
||||
- Move beyond NumPy's black box implementations
|
||||
- Implement matrix multiplication, convolution, and activations from scratch
|
||||
- Implement specialized matrix multiplication and activations
|
||||
- Understand the computational patterns underlying ML
|
||||
|
||||
### 2. **Use**: Performance Optimization
|
||||
@@ -54,64 +54,73 @@ This module follows the **"Build → Use → Optimize"** pedagogical framework:
|
||||
|
||||
### **Step 1: Understanding Custom Operations**
|
||||
```python
|
||||
# Move beyond NumPy to custom implementations
|
||||
def matmul_custom(A, B):
|
||||
# Your low-level implementation
|
||||
return result
|
||||
|
||||
def relu_custom(x):
|
||||
# Understanding what happens inside activation functions
|
||||
return np.maximum(0, x)
|
||||
# Build on TinyTorch's proven implementations
|
||||
def matmul_baseline(A, B):
|
||||
# Use TinyTorch's reliable matmul as baseline
|
||||
return matmul(A.data, B.data)
|
||||
```
|
||||
|
||||
### **Step 2: SIMD Vectorization**
|
||||
```python
|
||||
# CPU optimization with vector operations
|
||||
def matmul_vectorized(A, B):
|
||||
# Use SIMD instructions for parallel computation
|
||||
return optimized_result
|
||||
def vectorized_relu(x):
|
||||
# SIMD-optimized activation using NumPy's vectorized operations
|
||||
return np.maximum(0, x)
|
||||
|
||||
def vectorized_operations(x, y):
|
||||
# Element-wise operations optimized for SIMD
|
||||
return {
|
||||
'multiply': x * y,
|
||||
'add': x + y,
|
||||
'squared_diff': (x - y)**2
|
||||
}
|
||||
```
|
||||
|
||||
### **Step 3: Memory Layout Optimization**
|
||||
```python
|
||||
# Cache-friendly data structures
|
||||
def matmul_cache_optimized(A, B):
|
||||
# Optimize memory access patterns
|
||||
return cache_friendly_result
|
||||
def cache_friendly_matmul(A, B, block_size=32):
|
||||
# Blocked matrix multiplication for better cache utilization
|
||||
return blocked_result
|
||||
```
|
||||
|
||||
### **Step 4: GPU-Style Parallel Computing**
|
||||
```python
|
||||
# Understand parallel computing concepts
|
||||
def matmul_parallel(A, B):
|
||||
# Parallel processing patterns
|
||||
# Parallel processing patterns
|
||||
def parallel_relu(x, num_workers=4):
|
||||
# Multi-core CPU utilization with ThreadPoolExecutor
|
||||
return parallel_result
|
||||
|
||||
def parallel_batch_processing(batch_data, operation, num_workers=4):
|
||||
# Process multiple tensors simultaneously
|
||||
return batch_results
|
||||
```
|
||||
|
||||
### **Step 5: Performance Profiling**
|
||||
```python
|
||||
# Measure and optimize performance
|
||||
profiler = KernelProfiler()
|
||||
profiler.benchmark(matmul_custom, matmul_vectorized, matmul_parallel)
|
||||
profiler = SimpleProfiler()
|
||||
result, metrics = profiler.profile(kernel_function, *args)
|
||||
print(f"Wall time: {metrics['wall_time']:.4f}s")
|
||||
```
|
||||
|
||||
### **Step 6: Compressed Model Kernels**
|
||||
```python
|
||||
# Hardware-optimized operations for compressed models
|
||||
def quantized_matmul(A_int8, B_int8):
|
||||
# Optimized kernels for quantized models
|
||||
return result
|
||||
def quantized_matmul(A, B, scale_A=1.0, scale_B=1.0):
|
||||
# INT8 matrix multiplication for mobile deployment
|
||||
return quantized_result
|
||||
|
||||
def sparse_matmul(A_sparse, B):
|
||||
# Efficient sparse matrix operations
|
||||
return result
|
||||
def quantized_relu(x, scale=1.0):
|
||||
# Integer domain ReLU activation
|
||||
return quantized_result
|
||||
```
|
||||
|
||||
## 🎓 Learning Path
|
||||
|
||||
### **Foundation Level**: Understanding Implementation
|
||||
- See what happens inside NumPy operations
|
||||
- Implement basic kernels with explicit loops
|
||||
- Build on TinyTorch's proven components
|
||||
- Debug performance bottlenecks
|
||||
|
||||
### **Intermediate Level**: CPU Optimization
|
||||
@@ -137,13 +146,13 @@ def sparse_matmul(A_sparse, B):
|
||||
- Assembly-level optimization concepts
|
||||
|
||||
### **Performance Engineering**
|
||||
- Profiling and benchmarking
|
||||
- Profiling and benchmarking with SimpleProfiler
|
||||
- Bottleneck identification
|
||||
- Performance optimization strategies
|
||||
|
||||
### **Parallel Computing**
|
||||
- Thread-level parallelism
|
||||
- SIMD vectorization
|
||||
- Thread-level parallelism with ThreadPoolExecutor
|
||||
- SIMD vectorization principles
|
||||
- GPU computing concepts
|
||||
|
||||
### **Systems Integration**
|
||||
@@ -181,11 +190,11 @@ def sparse_matmul(A_sparse, B):
|
||||
# Navigate to the kernels module
|
||||
cd modules/source/11_kernels
|
||||
|
||||
# Work in the development notebook
|
||||
jupyter notebook kernels_dev.ipynb
|
||||
|
||||
# Or work in the Python file
|
||||
# Work in the development file
|
||||
code kernels_dev.py
|
||||
|
||||
# Or work in the Jupyter notebook
|
||||
jupyter notebook kernels_dev.ipynb
|
||||
```
|
||||
|
||||
## 📖 Module Structure
|
||||
@@ -194,33 +203,54 @@ code kernels_dev.py
|
||||
modules/source/11_kernels/
|
||||
├── kernels_dev.py # Main development file (work here!)
|
||||
├── kernels_dev.ipynb # Jupyter notebook version
|
||||
├── tests/
|
||||
│ └── test_kernels.py # Performance and correctness tests
|
||||
├── README.md # This file
|
||||
└── benchmarks/ # Performance benchmarking tools
|
||||
└── module.yaml # Module metadata
|
||||
```
|
||||
|
||||
## 🧪 Testing Your Implementation
|
||||
|
||||
### Performance Testing
|
||||
```bash
|
||||
# Run performance benchmarks
|
||||
python -m pytest tests/test_kernels.py -v --benchmark
|
||||
### Inline Testing
|
||||
```python
|
||||
# All tests are inline within kernels_dev.py
|
||||
def test_matmul_baseline():
|
||||
# Test baseline matrix multiplication
|
||||
pass
|
||||
|
||||
# Profile specific operations
|
||||
python -c "from kernels_dev import benchmark_kernels; benchmark_kernels()"
|
||||
def test_vectorized_operations():
|
||||
# Test SIMD vectorization
|
||||
pass
|
||||
|
||||
def test_cache_friendly_matmul():
|
||||
# Test cache optimization
|
||||
pass
|
||||
|
||||
def test_parallel_processing():
|
||||
# Test parallel computing
|
||||
pass
|
||||
|
||||
def test_performance_profiling():
|
||||
# Test profiling tools
|
||||
pass
|
||||
|
||||
def test_compressed_kernels():
|
||||
# Test quantized operations
|
||||
pass
|
||||
|
||||
def final_performance_test():
|
||||
# Comprehensive performance comparison
|
||||
pass
|
||||
```
|
||||
|
||||
### Integration Testing
|
||||
```bash
|
||||
# Test with compressed models
|
||||
python -c "from kernels_dev import test_compressed_kernels; test_compressed_kernels()"
|
||||
### Performance Benchmarking
|
||||
```python
|
||||
# Run comprehensive performance tests
|
||||
final_performance_test()
|
||||
```
|
||||
|
||||
## 🎯 Success Criteria
|
||||
|
||||
You've mastered hardware-aware optimization when:
|
||||
- ✅ Can implement custom ML operations from scratch
|
||||
- ✅ Can implement custom ML operations building on TinyTorch components
|
||||
- ✅ Understand CPU optimization techniques (SIMD, caching)
|
||||
- ✅ Can profile and benchmark performance improvements
|
||||
- ✅ Successfully integrate with compressed models
|
||||
@@ -229,7 +259,7 @@ You've mastered hardware-aware optimization when:
|
||||
## 🔍 Common Challenges
|
||||
|
||||
### **Performance Debugging**
|
||||
- Use profiling tools to identify bottlenecks
|
||||
- Use SimpleProfiler to identify bottlenecks
|
||||
- Understand the difference between algorithmic and implementation efficiency
|
||||
- Learn to read performance metrics
|
||||
|
||||
@@ -248,17 +278,44 @@ You've mastered hardware-aware optimization when:
|
||||
After completing this module, you're ready for:
|
||||
- **Module 12: Benchmarking** - Systematic performance measurement
|
||||
- **Module 13: MLOps** - Production deployment and monitoring
|
||||
- **Real-world applications** - Apply optimization skills to production systems
|
||||
|
||||
## 🤝 Getting Help
|
||||
## 📊 Performance Insights
|
||||
|
||||
- Focus on understanding principles over memorizing techniques
|
||||
- Use profiling tools to guide optimization decisions
|
||||
- Connect optimization choices to real-world constraints
|
||||
- Remember: **Build → Use → Optimize!**
|
||||
### **Performance Hierarchy**
|
||||
```
|
||||
Python loops: 1x speed (baseline)
|
||||
NumPy operations: 10x speed (vectorized)
|
||||
Optimized kernels: 100x speed (hardware-aware)
|
||||
GPU kernels: 1000x speed (massive parallelism)
|
||||
```
|
||||
|
||||
---
|
||||
### **Memory Hierarchy**
|
||||
```
|
||||
CPU Registers: 1 cycle (fastest, tiny)
|
||||
L1 Cache: 3 cycles (fast, small)
|
||||
L2 Cache: 10 cycles (medium, medium)
|
||||
L3 Cache: 40 cycles (slow, large)
|
||||
Main Memory: 200+ cycles (slowest, huge)
|
||||
```
|
||||
|
||||
**Ready to optimize ML systems for real-world performance?** 🚀
|
||||
### **Real-World Impact**
|
||||
- Training time: 10 hours → 1 hour
|
||||
- Inference cost: $1000/month → $100/month
|
||||
- Energy efficiency: 90% reduction
|
||||
|
||||
*This module bridges the gap between algorithmic optimization and hardware-level performance engineering, preparing you for production ML systems deployment.*
|
||||
## 🏆 What Students Build
|
||||
|
||||
By the end of this module, students have implemented:
|
||||
|
||||
1. **`matmul_baseline()`** - Reliable matrix multiplication using TinyTorch
|
||||
2. **`vectorized_relu()`** - SIMD-optimized ReLU activation
|
||||
3. **`vectorized_operations()`** - Element-wise operations with vectorization
|
||||
4. **`cache_friendly_matmul()`** - Blocked matrix multiplication
|
||||
5. **`parallel_relu()`** - Multi-core CPU utilization
|
||||
6. **`parallel_batch_processing()`** - Batch processing with workers
|
||||
7. **`quantized_matmul()`** - INT8 matrix multiplication
|
||||
8. **`quantized_relu()`** - Integer domain ReLU
|
||||
9. **Performance profiling** - Using SimpleProfiler for benchmarking
|
||||
10. **Final performance test** - Comprehensive comparison of all implementations
|
||||
|
||||
Students understand how modern ML frameworks like PyTorch (2000+ CUDA kernels) and TensorFlow (XLA compiler) achieve their performance through hardware-aware optimization.
|
||||
Reference in New Issue
Block a user