mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-26 08:51:35 -05:00
Features: - 16 checkpoint test suite validating ML systems capabilities - Integration tests covering complete learning progression - Rich CLI progress tracking with visual timelines - Capability-driven assessment from environment to production Checkpoints: - Environment setup through full ML system deployment - Each checkpoint validates integrated functionality - Progressive capability building with clear success criteria - Professional CLI interface with status/timeline/test commands
274 lines
12 KiB
Python
274 lines
12 KiB
Python
"""
|
|
Checkpoint 12: Kernels (After Module 13 - Kernels)
|
|
Question: "Can I implement high-performance computational kernels?"
|
|
"""
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
def test_checkpoint_12_kernels():
    """
    Checkpoint 12: Kernels

    Validates that students can implement and optimize computational kernels
    for high-performance machine learning operations - essential for
    understanding how modern ML frameworks achieve speed and efficiency.

    Each sub-test is wrapped in its own try/except so one failing kernel
    reports a warning line instead of aborting the whole checkpoint; only
    missing imports (modules not yet completed) fail the test outright.
    """
    print("\n⚡ Checkpoint 12: Kernels")
    print("=" * 50)

    try:
        from tinytorch.core.tensor import Tensor
        from tinytorch.core.kernels import (
            time_kernel, matmul_baseline, vectorized_relu, vectorized_operations,
            cache_friendly_matmul, parallel_relu, parallel_batch_processing,
            quantized_matmul, quantized_relu
        )
        from tinytorch.core.activations import ReLU
    except ImportError as e:
        pytest.fail(f"❌ Cannot import required classes - complete Modules 2-13 first: {e}")

    # Test 1: Kernel timing infrastructure
    print("⏱️ Testing kernel timing...")

    def simple_operation(x):
        return x * 2

    # Test timing functionality
    test_data = np.random.randn(100, 100)

    try:
        # time_kernel is expected to return (elapsed_seconds, result).
        execution_time, result = time_kernel(simple_operation, test_data)

        assert execution_time > 0, f"Execution time should be positive, got {execution_time}"
        assert np.allclose(result, test_data * 2), "Timing should preserve operation correctness"
        print(f"✅ Kernel timing: {execution_time:.6f}s for 100x100 operation")
    except Exception as e:
        print(f"⚠️ Kernel timing: {e}")

    # Test 2: Matrix multiplication optimization
    print("🔢 Testing matrix multiplication kernels...")

    # Test baseline matmul
    A = np.random.randn(64, 32)
    B = np.random.randn(32, 48)
    # Compute the NumPy reference OUTSIDE the try block: the cache-friendly
    # test below also compares against `expected`, and it must stay bound
    # even if matmul_baseline raises (previously a confusing NameError).
    expected = np.dot(A, B)

    try:
        result_baseline = matmul_baseline(A, B)

        assert result_baseline.shape == expected.shape, f"Baseline matmul shape mismatch: {result_baseline.shape} vs {expected.shape}"
        assert np.allclose(result_baseline, expected, rtol=1e-5), "Baseline matmul should match NumPy"
        print(f"✅ Baseline matmul: {A.shape} @ {B.shape} → {result_baseline.shape}")
    except Exception as e:
        print(f"⚠️ Baseline matmul: {e}")

    # Test cache-friendly matmul
    try:
        result_cache_friendly = cache_friendly_matmul(A, B)

        assert result_cache_friendly.shape == expected.shape, "Cache-friendly matmul shape mismatch"
        assert np.allclose(result_cache_friendly, expected, rtol=1e-5), "Cache-friendly matmul should match NumPy"
        print("✅ Cache-friendly matmul: optimized memory access patterns")
    except Exception as e:
        print(f"⚠️ Cache-friendly matmul: {e}")

    # Test 3: Vectorized operations
    print("🚀 Testing vectorized operations...")

    # Test vectorized ReLU
    test_input = np.array([-2, -1, 0, 1, 2]).astype(np.float32)

    try:
        vectorized_result = vectorized_relu(test_input)
        expected_relu = np.maximum(0, test_input)

        assert np.allclose(vectorized_result, expected_relu), "Vectorized ReLU should match expected behavior"
        print(f"✅ Vectorized ReLU: {test_input} → {vectorized_result}")
    except Exception as e:
        print(f"⚠️ Vectorized ReLU: {e}")

    # Test vectorized operations suite
    try:
        ops_input = np.random.randn(1000).astype(np.float32)
        ops_result = vectorized_operations(ops_input)

        assert len(ops_result) > 0, "Vectorized operations should return results"
        print(f"✅ Vectorized operations: processed {len(ops_input)} elements")
    except Exception as e:
        print(f"⚠️ Vectorized operations: {e}")

    # Test 4: Parallel processing
    print("🔀 Testing parallel processing...")

    # Test parallel ReLU
    parallel_input = np.random.randn(10000).astype(np.float32)

    try:
        parallel_result = parallel_relu(parallel_input)
        expected_parallel = np.maximum(0, parallel_input)

        assert parallel_result.shape == expected_parallel.shape, "Parallel ReLU shape mismatch"
        assert np.allclose(parallel_result, expected_parallel, rtol=1e-5), "Parallel ReLU should match sequential"
        print(f"✅ Parallel ReLU: processed {len(parallel_input)} elements")
    except Exception as e:
        print(f"⚠️ Parallel ReLU: {e}")

    # Test parallel batch processing
    try:
        batch_data = np.random.randn(8, 512, 512).astype(np.float32)  # 8 samples, 512x512 each
        batch_result = parallel_batch_processing(batch_data)

        assert batch_result.shape[0] == batch_data.shape[0], "Batch processing should preserve batch dimension"
        print(f"✅ Parallel batch processing: {batch_data.shape} → {batch_result.shape}")
    except Exception as e:
        print(f"⚠️ Parallel batch processing: {e}")

    # Test 5: Quantization kernels
    print("🗜️ Testing quantization kernels...")

    # Test quantized matrix multiplication
    try:
        A_quant = np.random.randn(32, 16).astype(np.float32)
        B_quant = np.random.randn(16, 24).astype(np.float32)

        quant_result = quantized_matmul(A_quant, B_quant, bits=8)
        reference_result = np.dot(A_quant, B_quant)

        assert quant_result.shape == reference_result.shape, "Quantized matmul shape should match reference"

        # Quantization should be approximately correct (some precision loss expected)
        relative_error = np.mean(np.abs((quant_result - reference_result) / (reference_result + 1e-8)))
        assert relative_error < 0.2, f"Quantized matmul error too high: {relative_error:.3f}"
        print(f"✅ Quantized matmul: 8-bit quantization, error={relative_error:.3f}")
    except Exception as e:
        print(f"⚠️ Quantized matmul: {e}")

    # Test quantized ReLU
    try:
        relu_input = np.random.randn(1000).astype(np.float32)
        quant_relu_result = quantized_relu(relu_input, bits=8)
        reference_relu = np.maximum(0, relu_input)

        assert quant_relu_result.shape == reference_relu.shape, "Quantized ReLU shape should match reference"
        print("✅ Quantized ReLU: 8-bit activation quantization")
    except Exception as e:
        print(f"⚠️ Quantized ReLU: {e}")

    # Test 6: Performance comparison
    print("📊 Testing performance comparison...")

    # Compare naive vs optimized implementations
    test_matrix_A = np.random.randn(128, 128).astype(np.float32)
    test_matrix_B = np.random.randn(128, 128).astype(np.float32)

    try:
        # Time baseline implementation
        baseline_time, baseline_result = time_kernel(matmul_baseline, test_matrix_A, test_matrix_B)

        # Time cache-friendly implementation
        optimized_time, optimized_result = time_kernel(cache_friendly_matmul, test_matrix_A, test_matrix_B)

        # Both should be correct
        assert np.allclose(baseline_result, optimized_result, rtol=1e-5), "Optimized version should match baseline"

        # Guard against a zero timing (clock resolution) before dividing.
        speedup = baseline_time / optimized_time if optimized_time > 0 else 1.0
        print(f"✅ Performance: baseline={baseline_time:.6f}s, optimized={optimized_time:.6f}s, speedup={speedup:.2f}x")
    except Exception as e:
        print(f"⚠️ Performance comparison: {e}")

    # Test 7: Memory efficiency
    print("💾 Testing memory efficiency...")

    # Test memory-efficient operations
    large_data = np.random.randn(1000, 1000).astype(np.float32)

    try:
        # Process in chunks to test memory efficiency
        chunk_results = []
        chunk_size = 100

        for i in range(0, large_data.shape[0], chunk_size):
            chunk = large_data[i:i+chunk_size]
            chunk_result = vectorized_relu(chunk.flatten()).reshape(chunk.shape)
            chunk_results.append(chunk_result)

        chunked_result = np.vstack(chunk_results)
        direct_result = vectorized_relu(large_data.flatten()).reshape(large_data.shape)

        assert np.allclose(chunked_result, direct_result, rtol=1e-5), "Chunked processing should match direct processing"
        print(f"✅ Memory efficiency: processed {large_data.shape} in {chunk_size}-row chunks")
    except Exception as e:
        print(f"⚠️ Memory efficiency: {e}")

    # Test 8: Integration with TinyTorch tensors
    print("🔗 Testing TinyTorch integration...")

    try:
        # Test that kernels work with TinyTorch tensors
        tensor_a = Tensor(np.random.randn(32, 32))
        tensor_b = Tensor(np.random.randn(32, 32))

        # Extract numpy arrays for kernel operations
        kernel_result = matmul_baseline(tensor_a.data, tensor_b.data)
        tensor_result = Tensor(kernel_result)

        assert tensor_result.shape == (32, 32), "Tensor integration should preserve shape"
        print("✅ TinyTorch integration: kernels work with Tensor.data")
    except Exception as e:
        print(f"⚠️ TinyTorch integration: {e}")

    # Test 9: Kernel composition
    print("🧩 Testing kernel composition...")

    try:
        # Compose multiple kernel operations
        input_data = np.random.randn(64, 64).astype(np.float32)

        # Pipeline: MatMul → ReLU → Quantization
        intermediate = matmul_baseline(input_data, input_data.T)  # Square result
        activated = vectorized_relu(intermediate.flatten()).reshape(intermediate.shape)
        quantized = quantized_relu(activated.flatten(), bits=8).reshape(activated.shape)

        assert quantized.shape == input_data.shape, "Kernel pipeline should preserve dimensions"
        assert np.all(quantized >= 0), "Pipeline result should be non-negative after ReLU"
        print("✅ Kernel composition: MatMul → ReLU → Quantization pipeline")
    except Exception as e:
        print(f"⚠️ Kernel composition: {e}")

    # Test 10: Advanced optimization features
    print("🚁 Testing advanced optimizations...")

    try:
        # Test that optimization features are available
        medium_input = np.random.randn(256, 256).astype(np.float32)

        # Time multiple approaches
        approaches = []

        # Baseline approach
        baseline_time, _ = time_kernel(np.dot, medium_input, medium_input.T)
        approaches.append(("NumPy baseline", baseline_time))

        # Our optimized approach
        optimized_time, _ = time_kernel(cache_friendly_matmul, medium_input, medium_input.T)
        approaches.append(("Cache-friendly", optimized_time))

        # Find fastest approach
        fastest = min(approaches, key=lambda x: x[1])
        print(f"✅ Advanced optimizations: fastest approach is {fastest[0]} at {fastest[1]:.6f}s")

        # Verify we have meaningful optimization choices
        assert len(approaches) >= 2, "Should have multiple optimization approaches"

    except Exception as e:
        print(f"⚠️ Advanced optimizations: {e}")

    print("\n🎉 Kernels Complete!")
    print("📝 You can now implement high-performance computational kernels")
    print("🔧 Built capabilities: Timing, vectorization, parallelization, quantization, memory optimization")
    print("🧠 Breakthrough: You understand how to optimize ML operations for real-world performance!")
    print("🎯 Next: Add performance analysis and bottleneck identification")
|
|
if __name__ == "__main__":
|
|
test_checkpoint_12_kernels() |