""" Checkpoint 12: Kernels (After Module 13 - Kernels) Question: "Can I implement high-performance computational kernels?" """ import numpy as np import pytest def test_checkpoint_12_kernels(): """ Checkpoint 12: Kernels Validates that students can implement and optimize computational kernels for high-performance machine learning operations - essential for understanding how modern ML frameworks achieve speed and efficiency. """ print("\n⚡ Checkpoint 12: Kernels") print("=" * 50) try: from tinytorch.core.tensor import Tensor from tinytorch.core.kernels import ( time_kernel, matmul_baseline, vectorized_relu, vectorized_operations, cache_friendly_matmul, parallel_relu, parallel_batch_processing, quantized_matmul, quantized_relu ) from tinytorch.core.activations import ReLU except ImportError as e: pytest.fail(f"❌ Cannot import required classes - complete Modules 2-13 first: {e}") # Test 1: Kernel timing infrastructure print("⏱️ Testing kernel timing...") def simple_operation(x): return x * 2 # Test timing functionality test_data = np.random.randn(100, 100) try: execution_time, result = time_kernel(simple_operation, test_data) assert execution_time > 0, f"Execution time should be positive, got {execution_time}" assert np.allclose(result, test_data * 2), "Timing should preserve operation correctness" print(f"✅ Kernel timing: {execution_time:.6f}s for 100x100 operation") except Exception as e: print(f"⚠️ Kernel timing: {e}") # Test 2: Matrix multiplication optimization print("🔢 Testing matrix multiplication kernels...") # Test baseline matmul A = np.random.randn(64, 32) B = np.random.randn(32, 48) try: result_baseline = matmul_baseline(A, B) expected = np.dot(A, B) assert result_baseline.shape == expected.shape, f"Baseline matmul shape mismatch: {result_baseline.shape} vs {expected.shape}" assert np.allclose(result_baseline, expected, rtol=1e-5), "Baseline matmul should match NumPy" print(f"✅ Baseline matmul: {A.shape} @ {B.shape} → {result_baseline.shape}") except Exception as e: print(f"⚠️ Baseline matmul: {e}") # Test cache-friendly matmul try: result_cache_friendly = cache_friendly_matmul(A, B) assert result_cache_friendly.shape == expected.shape, f"Cache-friendly matmul shape mismatch" assert np.allclose(result_cache_friendly, expected, rtol=1e-5), "Cache-friendly matmul should match NumPy" print(f"✅ Cache-friendly matmul: optimized memory access patterns") except Exception as e: print(f"⚠️ Cache-friendly matmul: {e}") # Test 3: Vectorized operations print("🚀 Testing vectorized operations...") # Test vectorized ReLU test_input = np.array([-2, -1, 0, 1, 2]).astype(np.float32) try: vectorized_result = vectorized_relu(test_input) expected_relu = np.maximum(0, test_input) assert np.allclose(vectorized_result, expected_relu), "Vectorized ReLU should match expected behavior" print(f"✅ Vectorized ReLU: {test_input} → {vectorized_result}") except Exception as e: print(f"⚠️ Vectorized ReLU: {e}") # Test vectorized operations suite try: ops_input = np.random.randn(1000).astype(np.float32) ops_result = vectorized_operations(ops_input) assert len(ops_result) > 0, "Vectorized operations should return results" print(f"✅ Vectorized operations: processed {len(ops_input)} elements") except Exception as e: print(f"⚠️ Vectorized operations: {e}") # Test 4: Parallel processing print("🔀 Testing parallel processing...") # Test parallel ReLU parallel_input = np.random.randn(10000).astype(np.float32) try: parallel_result = parallel_relu(parallel_input) expected_parallel = np.maximum(0, parallel_input) assert 
        assert np.allclose(parallel_result, expected_parallel, rtol=1e-5), "Parallel ReLU should match sequential"
        print(f"✅ Parallel ReLU: processed {len(parallel_input)} elements")
    except Exception as e:
        print(f"⚠️ Parallel ReLU: {e}")

    # Test parallel batch processing
    try:
        batch_data = np.random.randn(8, 512, 512).astype(np.float32)  # 8 samples, 512x512 each
        batch_result = parallel_batch_processing(batch_data)
        assert batch_result.shape[0] == batch_data.shape[0], "Batch processing should preserve batch dimension"
        print(f"✅ Parallel batch processing: {batch_data.shape} → {batch_result.shape}")
    except Exception as e:
        print(f"⚠️ Parallel batch processing: {e}")

    # Test 5: Quantization kernels
    print("🗜️ Testing quantization kernels...")

    # Test quantized matrix multiplication
    try:
        A_quant = np.random.randn(32, 16).astype(np.float32)
        B_quant = np.random.randn(16, 24).astype(np.float32)
        quant_result = quantized_matmul(A_quant, B_quant, bits=8)
        reference_result = np.dot(A_quant, B_quant)

        assert quant_result.shape == reference_result.shape, "Quantized matmul shape should match reference"
        # Quantization should be approximately correct (some precision loss expected)
        relative_error = np.mean(np.abs((quant_result - reference_result) / (reference_result + 1e-8)))
        assert relative_error < 0.2, f"Quantized matmul error too high: {relative_error:.3f}"
        print(f"✅ Quantized matmul: 8-bit quantization, error={relative_error:.3f}")
    except Exception as e:
        print(f"⚠️ Quantized matmul: {e}")

    # Test quantized ReLU
    try:
        relu_input = np.random.randn(1000).astype(np.float32)
        quant_relu_result = quantized_relu(relu_input, bits=8)
        reference_relu = np.maximum(0, relu_input)

        assert quant_relu_result.shape == reference_relu.shape, "Quantized ReLU shape should match reference"
        print("✅ Quantized ReLU: 8-bit activation quantization")
    except Exception as e:
        print(f"⚠️ Quantized ReLU: {e}")

    # Test 6: Performance comparison
    print("📊 Testing performance comparison...")

    # Compare naive vs optimized implementations
    test_matrix_A = np.random.randn(128, 128).astype(np.float32)
    test_matrix_B = np.random.randn(128, 128).astype(np.float32)
    try:
        # Time baseline implementation
        baseline_time, baseline_result = time_kernel(matmul_baseline, test_matrix_A, test_matrix_B)

        # Time cache-friendly implementation
        optimized_time, optimized_result = time_kernel(cache_friendly_matmul, test_matrix_A, test_matrix_B)

        # Both should be correct
        assert np.allclose(baseline_result, optimized_result, rtol=1e-5), "Optimized version should match baseline"

        speedup = baseline_time / optimized_time if optimized_time > 0 else 1.0
        print(f"✅ Performance: baseline={baseline_time:.6f}s, optimized={optimized_time:.6f}s, speedup={speedup:.2f}x")
    except Exception as e:
        print(f"⚠️ Performance comparison: {e}")

    # Test 7: Memory efficiency
    print("💾 Testing memory efficiency...")

    # Test memory-efficient operations
    large_data = np.random.randn(1000, 1000).astype(np.float32)
    try:
        # Process in chunks to test memory efficiency
        chunk_results = []
        chunk_size = 100
        for i in range(0, large_data.shape[0], chunk_size):
            chunk = large_data[i:i + chunk_size]
            chunk_result = vectorized_relu(chunk.flatten()).reshape(chunk.shape)
            chunk_results.append(chunk_result)

        chunked_result = np.vstack(chunk_results)
        direct_result = vectorized_relu(large_data.flatten()).reshape(large_data.shape)

        assert np.allclose(chunked_result, direct_result, rtol=1e-5), "Chunked processing should match direct processing"
        print(f"✅ Memory efficiency: processed {large_data.shape} in {chunk_size}-row chunks")
    except Exception as e:
        print(f"⚠️ Memory efficiency: {e}")

    # Test 8: Integration with TinyTorch tensors
    print("🔗 Testing TinyTorch integration...")

    try:
        # Test that kernels work with TinyTorch tensors
        tensor_a = Tensor(np.random.randn(32, 32))
        tensor_b = Tensor(np.random.randn(32, 32))

        # Extract numpy arrays for kernel operations
        kernel_result = matmul_baseline(tensor_a.data, tensor_b.data)
        tensor_result = Tensor(kernel_result)

        assert tensor_result.shape == (32, 32), "Tensor integration should preserve shape"
        print("✅ TinyTorch integration: kernels work with Tensor.data")
    except Exception as e:
        print(f"⚠️ TinyTorch integration: {e}")

    # Test 9: Kernel composition
    print("🧩 Testing kernel composition...")

    try:
        # Compose multiple kernel operations
        input_data = np.random.randn(64, 64).astype(np.float32)

        # Pipeline: MatMul → ReLU → Quantization
        intermediate = matmul_baseline(input_data, input_data.T)  # Square result
        activated = vectorized_relu(intermediate.flatten()).reshape(intermediate.shape)
        quantized = quantized_relu(activated.flatten(), bits=8).reshape(activated.shape)

        assert quantized.shape == input_data.shape, "Kernel pipeline should preserve dimensions"
        assert np.all(quantized >= 0), "Pipeline result should be non-negative after ReLU"
        print("✅ Kernel composition: MatMul → ReLU → Quantization pipeline")
    except Exception as e:
        print(f"⚠️ Kernel composition: {e}")

    # Test 10: Advanced optimization features
    print("🚁 Testing advanced optimizations...")

    try:
        # Test that optimization features are available
        medium_input = np.random.randn(256, 256).astype(np.float32)

        # Time multiple approaches
        approaches = []

        # Baseline approach
        baseline_time, _ = time_kernel(np.dot, medium_input, medium_input.T)
        approaches.append(("NumPy baseline", baseline_time))

        # Our optimized approach
        optimized_time, _ = time_kernel(cache_friendly_matmul, medium_input, medium_input.T)
        approaches.append(("Cache-friendly", optimized_time))

        # Find fastest approach
        fastest = min(approaches, key=lambda x: x[1])
        print(f"✅ Advanced optimizations: fastest approach is {fastest[0]} at {fastest[1]:.6f}s")

        # Verify we have meaningful optimization choices
        assert len(approaches) >= 2, "Should have multiple optimization approaches"
    except Exception as e:
        print(f"⚠️ Advanced optimizations: {e}")

    print("\n🎉 Kernels Complete!")
    print("📝 You can now implement high-performance computational kernels")
    print("🔧 Built capabilities: Timing, vectorization, parallelization, quantization, memory optimization")
    print("🧠 Breakthrough: You understand how to optimize ML operations for real-world performance!")
    print("🎯 Next: Add performance analysis and bottleneck identification")


if __name__ == "__main__":
    test_checkpoint_12_kernels()
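

# ---------------------------------------------------------------------------
# Reference sketch (not part of the graded checkpoint): one plausible way to
# make a matmul "cache-friendly" is loop blocking/tiling, so each tile of A,
# B, and C is reused while it is still resident in cache. The function name
# and block size below are illustrative assumptions, not the
# tinytorch.core.kernels API; the course's cache_friendly_matmul may be
# implemented differently.
# ---------------------------------------------------------------------------
def _blocked_matmul_sketch(A, B, block=64):
    """Tiled matrix multiply: accumulate C one block-sized sub-problem at a time."""
    n, k = A.shape
    k2, m = B.shape
    assert k == k2, "Inner dimensions must match"
    C = np.zeros((n, m), dtype=np.result_type(A, B))
    for i in range(0, n, block):
        for j in range(0, m, block):
            for p in range(0, k, block):
                # Each small product touches only ~3 * block**2 values,
                # improving cache reuse compared with streaming whole rows.
                C[i:i + block, j:j + block] += (
                    A[i:i + block, p:p + block] @ B[p:p + block, j:j + block]
                )
    return C
# Sanity check for the sketch: np.allclose(_blocked_matmul_sketch(A, B), A @ B)
# should hold for any float inputs with compatible shapes.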