TinyTorch/tests/checkpoints/checkpoint_12_kernels.py

"""
Checkpoint 12: Kernels (After Module 13 - Kernels)
Question: "Can I implement high-performance computational kernels?"
"""
import numpy as np
import pytest
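
# For orientation: the checkpoint below relies on time_kernel(fn, *args) returning a
# (elapsed_seconds, result) pair. The sketch here is an illustrative assumption only;
# the real helper is provided by tinytorch.core.kernels and may be implemented differently.
import time


def _reference_time_kernel(kernel_fn, *args):
    """Minimal reference of the timing contract used by this checkpoint (illustration only)."""
    start = time.perf_counter()
    result = kernel_fn(*args)
    return time.perf_counter() - start, result
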
def test_checkpoint_12_kernels():
"""
Checkpoint 12: Kernels
Validates that students can implement and optimize computational kernels
for high-performance machine learning operations - essential for
understanding how modern ML frameworks achieve speed and efficiency.
"""
print("\n⚡ Checkpoint 12: Kernels")
print("=" * 50)
try:
from tinytorch.core.tensor import Tensor
from tinytorch.core.kernels import (
time_kernel, matmul_baseline, vectorized_relu, vectorized_operations,
cache_friendly_matmul, parallel_relu, parallel_batch_processing,
quantized_matmul, quantized_relu
)
from tinytorch.core.activations import ReLU
except ImportError as e:
pytest.fail(f"❌ Cannot import required classes - complete Modules 2-13 first: {e}")
# Test 1: Kernel timing infrastructure
print("⏱️ Testing kernel timing...")
def simple_operation(x):
return x * 2
# Test timing functionality
test_data = np.random.randn(100, 100)
try:
execution_time, result = time_kernel(simple_operation, test_data)
assert execution_time > 0, f"Execution time should be positive, got {execution_time}"
assert np.allclose(result, test_data * 2), "Timing should preserve operation correctness"
print(f"✅ Kernel timing: {execution_time:.6f}s for 100x100 operation")
except Exception as e:
print(f"⚠️ Kernel timing: {e}")
    # Test 2: Matrix multiplication optimization
    print("🔢 Testing matrix multiplication kernels...")

    # Test baseline matmul
    A = np.random.randn(64, 32)
    B = np.random.randn(32, 48)
    expected = np.dot(A, B)  # reference result shared by both matmul tests below
    try:
        result_baseline = matmul_baseline(A, B)
        assert result_baseline.shape == expected.shape, f"Baseline matmul shape mismatch: {result_baseline.shape} vs {expected.shape}"
        assert np.allclose(result_baseline, expected, rtol=1e-5), "Baseline matmul should match NumPy"
        print(f"✅ Baseline matmul: {A.shape} @ {B.shape} → {result_baseline.shape}")
    except Exception as e:
        print(f"⚠️ Baseline matmul: {e}")

    # Test cache-friendly matmul
    try:
        result_cache_friendly = cache_friendly_matmul(A, B)
        assert result_cache_friendly.shape == expected.shape, "Cache-friendly matmul shape mismatch"
        assert np.allclose(result_cache_friendly, expected, rtol=1e-5), "Cache-friendly matmul should match NumPy"
        print("✅ Cache-friendly matmul: optimized memory access patterns")
    except Exception as e:
        print(f"⚠️ Cache-friendly matmul: {e}")
    # Test 3: Vectorized operations
    print("🚀 Testing vectorized operations...")

    # Test vectorized ReLU
    test_input = np.array([-2, -1, 0, 1, 2]).astype(np.float32)
    try:
        vectorized_result = vectorized_relu(test_input)
        expected_relu = np.maximum(0, test_input)
        assert np.allclose(vectorized_result, expected_relu), "Vectorized ReLU should match expected behavior"
        print(f"✅ Vectorized ReLU: {test_input} → {vectorized_result}")
    except Exception as e:
        print(f"⚠️ Vectorized ReLU: {e}")

    # Test vectorized operations suite
    try:
        ops_input = np.random.randn(1000).astype(np.float32)
        ops_result = vectorized_operations(ops_input)
        assert len(ops_result) > 0, "Vectorized operations should return results"
        print(f"✅ Vectorized operations: processed {len(ops_input)} elements")
    except Exception as e:
        print(f"⚠️ Vectorized operations: {e}")

    # Test 4: Parallel processing
    print("🔀 Testing parallel processing...")

    # Test parallel ReLU
    parallel_input = np.random.randn(10000).astype(np.float32)
    try:
        parallel_result = parallel_relu(parallel_input)
        expected_parallel = np.maximum(0, parallel_input)
        assert parallel_result.shape == expected_parallel.shape, "Parallel ReLU shape mismatch"
        assert np.allclose(parallel_result, expected_parallel, rtol=1e-5), "Parallel ReLU should match sequential"
        print(f"✅ Parallel ReLU: processed {len(parallel_input)} elements")
    except Exception as e:
        print(f"⚠️ Parallel ReLU: {e}")

    # Test parallel batch processing
    try:
        batch_data = np.random.randn(8, 512, 512).astype(np.float32)  # 8 samples, 512x512 each
        batch_result = parallel_batch_processing(batch_data)
        assert batch_result.shape[0] == batch_data.shape[0], "Batch processing should preserve batch dimension"
        print(f"✅ Parallel batch processing: {batch_data.shape} → {batch_result.shape}")
    except Exception as e:
        print(f"⚠️ Parallel batch processing: {e}")
    # Test 5: Quantization kernels
    print("🗜️ Testing quantization kernels...")

    # Test quantized matrix multiplication
    try:
        A_quant = np.random.randn(32, 16).astype(np.float32)
        B_quant = np.random.randn(16, 24).astype(np.float32)
        quant_result = quantized_matmul(A_quant, B_quant, bits=8)
        reference_result = np.dot(A_quant, B_quant)
        assert quant_result.shape == reference_result.shape, "Quantized matmul shape should match reference"
        # Quantization should be approximately correct (some precision loss expected)
        relative_error = np.mean(np.abs((quant_result - reference_result) / (reference_result + 1e-8)))
        assert relative_error < 0.2, f"Quantized matmul error too high: {relative_error:.3f}"
        print(f"✅ Quantized matmul: 8-bit quantization, error={relative_error:.3f}")
    except Exception as e:
        print(f"⚠️ Quantized matmul: {e}")
    # Test quantized ReLU
    try:
        relu_input = np.random.randn(1000).astype(np.float32)
        quant_relu_result = quantized_relu(relu_input, bits=8)
        reference_relu = np.maximum(0, relu_input)
        assert quant_relu_result.shape == reference_relu.shape, "Quantized ReLU shape should match reference"
        print("✅ Quantized ReLU: 8-bit activation quantization")
    except Exception as e:
        print(f"⚠️ Quantized ReLU: {e}")

    # Test 6: Performance comparison
    print("📊 Testing performance comparison...")

    # Compare naive vs optimized implementations
    test_matrix_A = np.random.randn(128, 128).astype(np.float32)
    test_matrix_B = np.random.randn(128, 128).astype(np.float32)
    try:
        # Time baseline implementation
        baseline_time, baseline_result = time_kernel(matmul_baseline, test_matrix_A, test_matrix_B)
        # Time cache-friendly implementation
        optimized_time, optimized_result = time_kernel(cache_friendly_matmul, test_matrix_A, test_matrix_B)
        # Both should be correct
        assert np.allclose(baseline_result, optimized_result, rtol=1e-5), "Optimized version should match baseline"
        speedup = baseline_time / optimized_time if optimized_time > 0 else 1.0
        print(f"✅ Performance: baseline={baseline_time:.6f}s, optimized={optimized_time:.6f}s, speedup={speedup:.2f}x")
    except Exception as e:
        print(f"⚠️ Performance comparison: {e}")

    # Test 7: Memory efficiency
    print("💾 Testing memory efficiency...")

    # Test memory-efficient operations
    large_data = np.random.randn(1000, 1000).astype(np.float32)
    try:
        # Process in chunks to test memory efficiency
        chunk_results = []
        chunk_size = 100
        for i in range(0, large_data.shape[0], chunk_size):
            chunk = large_data[i:i + chunk_size]
            chunk_result = vectorized_relu(chunk.flatten()).reshape(chunk.shape)
            chunk_results.append(chunk_result)
        chunked_result = np.vstack(chunk_results)
        direct_result = vectorized_relu(large_data.flatten()).reshape(large_data.shape)
        assert np.allclose(chunked_result, direct_result, rtol=1e-5), "Chunked processing should match direct processing"
        print(f"✅ Memory efficiency: processed {large_data.shape} in {chunk_size}-row chunks")
    except Exception as e:
        print(f"⚠️ Memory efficiency: {e}")

    # Test 8: Integration with TinyTorch tensors
    print("🔗 Testing TinyTorch integration...")
    try:
        # Test that kernels work with TinyTorch tensors
        tensor_a = Tensor(np.random.randn(32, 32))
        tensor_b = Tensor(np.random.randn(32, 32))
        # Extract numpy arrays for kernel operations
        kernel_result = matmul_baseline(tensor_a.data, tensor_b.data)
        tensor_result = Tensor(kernel_result)
        assert tensor_result.shape == (32, 32), "Tensor integration should preserve shape"
        print("✅ TinyTorch integration: kernels work with Tensor.data")
    except Exception as e:
        print(f"⚠️ TinyTorch integration: {e}")

    # Test 9: Kernel composition
    print("🧩 Testing kernel composition...")
    try:
        # Compose multiple kernel operations
        input_data = np.random.randn(64, 64).astype(np.float32)
        # Pipeline: MatMul → ReLU → Quantization
        intermediate = matmul_baseline(input_data, input_data.T)  # Square result
        activated = vectorized_relu(intermediate.flatten()).reshape(intermediate.shape)
        quantized = quantized_relu(activated.flatten(), bits=8).reshape(activated.shape)
        assert quantized.shape == input_data.shape, "Kernel pipeline should preserve dimensions"
        assert np.all(quantized >= 0), "Pipeline result should be non-negative after ReLU"
        print("✅ Kernel composition: MatMul → ReLU → Quantization pipeline")
    except Exception as e:
        print(f"⚠️ Kernel composition: {e}")

    # Test 10: Advanced optimization features
    print("🚁 Testing advanced optimizations...")
    try:
        # Test that optimization features are available
        medium_input = np.random.randn(256, 256).astype(np.float32)
        # Time multiple approaches
        approaches = []
        # Baseline approach
        baseline_time, _ = time_kernel(np.dot, medium_input, medium_input.T)
        approaches.append(("NumPy baseline", baseline_time))
        # Our optimized approach
        optimized_time, _ = time_kernel(cache_friendly_matmul, medium_input, medium_input.T)
        approaches.append(("Cache-friendly", optimized_time))
        # Find fastest approach
        fastest = min(approaches, key=lambda x: x[1])
        print(f"✅ Advanced optimizations: fastest approach is {fastest[0]} at {fastest[1]:.6f}s")
        # Verify we have meaningful optimization choices
        assert len(approaches) >= 2, "Should have multiple optimization approaches"
    except Exception as e:
        print(f"⚠️ Advanced optimizations: {e}")

    print("\n🎉 Kernels Complete!")
    print("📝 You can now implement high-performance computational kernels")
    print("🔧 Built capabilities: Timing, vectorization, parallelization, quantization, memory optimization")
    print("🧠 Breakthrough: You understand how to optimize ML operations for real-world performance!")
    print("🎯 Next: Add performance analysis and bottleneck identification")


if __name__ == "__main__":
    test_checkpoint_12_kernels()