"""
Module 13: Progressive Integration Tests

Tests that Module 13 (Kernels) works correctly AND that the entire prior stack works.

DEPENDENCY CHAIN: 01_setup → ... → 12_compression → 13_kernels
This is where we enable high-performance computational kernels and hardware acceleration.

Gating convention used throughout this file: every test performs its
``tinytorch`` imports inside a ``try`` block.  If *any* name in that import
group is missing, the whole test passes vacuously ("not implemented yet").
The imports are therefore all-or-nothing; there is no per-symbol gating.
The test body deliberately stays inside the same ``try`` so that an
``ImportError`` raised lazily by an optional backend is treated the same way.
"""

import sys
import time
from pathlib import Path

import numpy as np

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))


class TestPriorStackStillWorking:
    """Quick regression checks that prior modules (01→12) still work."""

    def test_complete_ml_system_stable(self):
        """Verify complete ML system remains stable."""
        # Environment (Module 01)
        assert sys.version_info >= (3, 8), "Foundation broken: Python version"

        # Complete ML system should work
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense
            from tinytorch.core.optimizers import Adam
            from tinytorch.core.training import Trainer
            from tinytorch.core.compression import prune_weights

            # All ML system components should be available
            model = Dense(10, 5)
            optimizer = Adam(model.parameters(), lr=0.001)
            Trainer(model, optimizer)  # successful construction is the check

            # Compression should still work
            pruned_weights = prune_weights(model.weights, sparsity=0.3)
            assert pruned_weights.shape == model.weights.shape, "Compression broken"

            # Basic ML functionality should work
            x = Tensor(np.random.randn(4, 10))
            output = model(x)
            assert output.shape == (4, 5), "ML system broken"

        except ImportError:
            pass  # ML system not implemented yet

    def test_efficiency_features_stable(self):
        """Verify efficiency modules (11→12) still work."""
        try:
            from tinytorch.core.training import Trainer
            from tinytorch.core.compression import quantize_weights
            from tinytorch.core.optimizers import SGD
            from tinytorch.core.layers import Dense

            # Efficiency features should work
            model = Dense(8, 3)
            optimizer = SGD(model.parameters(), lr=0.01)
            trainer = Trainer(model, optimizer)

            assert hasattr(trainer, 'train') or hasattr(trainer, 'fit'), "Training broken"

            # Compression should work
            quantized = quantize_weights(model.weights, bits=8)
            assert quantized.shape == model.weights.shape, "Quantization broken"

        except ImportError:
            pass  # Efficiency features not implemented yet


class TestModule13KernelsCore:
    """Test Module 13 (Kernels) core functionality."""

    def test_optimized_tensor_operations(self):
        """Test optimized tensor operation kernels against NumPy references."""
        try:
            from tinytorch.core.kernels import optimized_matmul, vectorized_add
            from tinytorch.core.tensor import Tensor

            # Test optimized matrix multiplication
            A = Tensor(np.random.randn(50, 30))
            B = Tensor(np.random.randn(30, 20))

            result = optimized_matmul(A, B)
            expected = np.dot(A.data, B.data)

            assert result.shape == (50, 20), "Optimized matmul shape broken"
            assert np.allclose(result.data, expected, rtol=1e-5), "Optimized matmul accuracy broken"

            # Test vectorized operations
            a = Tensor(np.random.randn(1000))
            b = Tensor(np.random.randn(1000))

            result = vectorized_add(a, b)
            expected = a.data + b.data

            assert result.shape == a.shape, "Vectorized add shape broken"
            assert np.allclose(result.data, expected), "Vectorized add accuracy broken"

        except ImportError:
            pass  # Optimized tensor operations not implemented yet

    def test_cuda_kernels(self):
        """Test CUDA acceleration kernels (no-op when CUDA is unavailable)."""
        try:
            from tinytorch.core.kernels import cuda_available, CudaKernel
            from tinytorch.core.tensor import Tensor

            # Check CUDA availability
            if not cuda_available():
                return  # CUDA not available, CPU fallback used

            # Test CUDA tensor operations
            kernel = CudaKernel('matmul')

            A = Tensor(np.random.randn(100, 50))
            B = Tensor(np.random.randn(50, 25))

            # Move to CUDA (if supported)
            if hasattr(A, 'cuda'):
                A_cuda = A.cuda()
                B_cuda = B.cuda()

                result = kernel.execute(A_cuda, B_cuda)
                assert result.shape == (100, 25), "CUDA kernel shape broken"

        except ImportError:
            pass  # CUDA kernels not implemented yet

    def test_custom_kernel_compilation(self):
        """Test custom kernel compilation and execution."""
        try:
            from tinytorch.core.kernels import compile_kernel, KernelCompiler

            # Simple element-wise operation kernel.  Written as a dedented
            # source string so it is valid top-level Python for the compiler.
            kernel_code = (
                "\n"
                "def element_wise_multiply(a, b):\n"
                "    return a * b\n"
            )

            compiled_kernel = compile_kernel(kernel_code, 'element_wise_multiply')

            # Test compiled kernel
            a = np.array([1, 2, 3, 4])
            b = np.array([2, 3, 4, 5])

            result = compiled_kernel(a, b)
            expected = a * b

            assert np.array_equal(result, expected), "Custom kernel compilation broken"

            # Test kernel compiler
            compiler = KernelCompiler(target='cpu', optimization_level=2)

            assert hasattr(compiler, 'compile'), "Kernel compiler broken: No compile method"
            assert hasattr(compiler, 'target'), "Kernel compiler broken: No target"

        except ImportError:
            pass  # Custom kernel compilation not implemented yet


class TestProgressiveStackIntegration:
    """Test that the complete stack (01→13) works together."""

    def test_accelerated_training_pipeline(self):
        """Test training pipeline with kernel acceleration."""
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense
            from tinytorch.core.optimizers import Adam
            from tinytorch.core.training import Trainer
            from tinytorch.core.kernels import enable_optimizations
            from tinytorch.core.data import Dataset, DataLoader

            # Enable kernel optimizations
            enable_optimizations(backend='auto')

            # Create accelerated training pipeline
            class AcceleratedModel:
                """Three stacked Dense layers: 50 -> 100 -> 20 -> 5."""

                def __init__(self):
                    self.layer1 = Dense(50, 100)
                    self.layer2 = Dense(100, 20)
                    self.layer3 = Dense(20, 5)

                def __call__(self, x):
                    h1 = self.layer1(x)
                    h2 = self.layer2(h1)
                    return self.layer3(h2)

                def parameters(self):
                    params = []
                    for layer in [self.layer1, self.layer2, self.layer3]:
                        if hasattr(layer, 'parameters'):
                            params.extend(layer.parameters())
                    return params

            # Dataset for performance testing
            class PerformanceDataset(Dataset):
                """200 random samples of 50 features with labels in [0, 5)."""

                def __init__(self):
                    self.data = np.random.randn(200, 50)
                    self.targets = np.random.randint(0, 5, 200)

                def __len__(self):
                    return 200

                def __getitem__(self, idx):
                    return Tensor(self.data[idx]), self.targets[idx]

            # Accelerated training
            model = AcceleratedModel()
            optimizer = Adam(model.parameters(), lr=0.001)
            Trainer(model, optimizer)  # successful construction is the check

            dataset = PerformanceDataset()
            dataloader = DataLoader(dataset, batch_size=16)

            # Test accelerated forward pass
            for batch_x, _batch_y in dataloader:
                output = model(batch_x)
                assert output.shape == (16, 5), "Accelerated training broken"
                break  # Test one batch

        except ImportError:
            pass  # Accelerated training pipeline not ready yet

    def test_large_scale_operations(self):
        """Test large-scale operations with kernel optimizations."""
        try:
            from tinytorch.core.kernels import optimized_matmul, batch_operations
            from tinytorch.core.tensor import Tensor

            # Large-scale matrix operations
            A = Tensor(np.random.randn(500, 300))
            B = Tensor(np.random.randn(300, 200))

            result = optimized_matmul(A, B)
            assert result.shape == (500, 200), "Large-scale matmul broken"

            # Batch of operations
            batch_size = 32
            matrices = [Tensor(np.random.randn(50, 30)) for _ in range(batch_size)]
            vectors = [Tensor(np.random.randn(30)) for _ in range(batch_size)]

            results = batch_operations('matmul', matrices, vectors)
            assert len(results) == batch_size, "Batch operations broken"

            for result in results:
                assert result.shape == (50,), "Batch operation result shape broken"

        except ImportError:
            pass  # Large-scale operations not ready yet

    def test_memory_optimized_operations(self):
        """Test memory-optimized kernel operations."""
        try:
            from tinytorch.core.kernels import in_place_operations, memory_pool
            from tinytorch.core.tensor import Tensor

            # In-place operations to save memory
            a = Tensor(np.random.randn(100, 100))
            b = Tensor(np.random.randn(100, 100))

            original_id = id(a.data)

            # In-place addition
            in_place_operations.add_(a, b)

            # Should modify original tensor (same underlying data object)
            assert id(a.data) == original_id, "In-place operation created copy"

            # Memory pool for efficient allocation
            pool = memory_pool.MemoryPool()

            # Allocate from pool
            tensor1 = pool.allocate_tensor(shape=(200, 200))
            tensor2 = pool.allocate_tensor(shape=(200, 200))

            # Should be memory efficient
            assert tensor1.shape == (200, 200), "Memory pool allocation broken"
            assert tensor2.shape == (200, 200), "Memory pool allocation broken"

            # Release memory
            pool.release(tensor1)
            pool.release(tensor2)

        except ImportError:
            pass  # Memory-optimized operations not ready yet


class TestPerformanceOptimizations:
    """Test performance optimizations and benchmarking."""

    def test_kernel_benchmarking(self):
        """Test kernel performance benchmarking."""
        try:
            from tinytorch.core.kernels import benchmark_kernel, KernelProfiler

            # Benchmark matrix multiplication
            sizes = [(100, 100), (200, 200), (500, 500)]

            for size in sizes:
                A = np.random.randn(*size)
                B = np.random.randn(*size)

                # Benchmark different implementations
                results = benchmark_kernel('matmul', A, B, num_trials=5)

                assert 'mean_time' in results, "Benchmark missing timing"
                assert 'std_time' in results, "Benchmark missing std"
                assert results['mean_time'] > 0, "Benchmark timing invalid"

            # Kernel profiler
            profiler = KernelProfiler()

            # Profile operations
            profiler.start()

            # Some operations to profile
            for _ in range(10):
                a = np.random.randn(50, 50)
                b = np.random.randn(50, 50)
                np.dot(a, b)

            profile_results = profiler.stop()

            assert 'total_time' in profile_results, "Profiler missing total time"
            assert 'operation_count' in profile_results, "Profiler missing operation count"

        except ImportError:
            pass  # Kernel benchmarking not ready yet

    def test_auto_optimization(self):
        """Test automatic kernel optimization selection."""
        try:
            from tinytorch.core.kernels import AutoOptimizer, select_best_kernel

            # Auto optimizer
            optimizer = AutoOptimizer()

            # Should detect best kernels for hardware
            best_config = optimizer.detect_optimal_config()

            assert 'matmul_kernel' in best_config, "Auto optimizer missing matmul"
            assert 'device' in best_config, "Auto optimizer missing device"

            # Kernel selection: test different kernel options for operation
            kernels = ['numpy', 'optimized_cpu', 'cuda']
            operation = 'matmul'
            shape = (100, 100)

            best_kernel = select_best_kernel(operation, shape, available_kernels=kernels)
            assert best_kernel in kernels, "Kernel selection invalid"

        except ImportError:
            pass  # Auto optimization not ready yet

    def test_vectorization_optimizations(self):
        """Test vectorization and SIMD optimizations."""
        try:
            from tinytorch.core.kernels import vectorized_ops, simd_support

            # Large arrays for vectorization
            a = np.random.randn(10000)
            b = np.random.randn(10000)

            # perf_counter is monotonic and high-resolution; time.time() can
            # be too coarse to measure a single 10k-element add reliably.

            # Time numpy baseline
            start = time.perf_counter()
            numpy_result = a + b
            numpy_time = time.perf_counter() - start

            # Time vectorized version
            start = time.perf_counter()
            vectorized_result = vectorized_ops.add(a, b)
            vectorized_time = time.perf_counter() - start

            # Results should be equivalent
            assert np.allclose(numpy_result, vectorized_result), "Vectorization accuracy broken"

            # Vectorized should be competitive or faster
            assert vectorized_time <= numpy_time * 2, "Vectorization significantly slower"

            # SIMD support detection
            capabilities = simd_support.detect_capabilities()

            assert isinstance(capabilities, dict), "SIMD detection should return dict"

            # Common SIMD instruction sets; each is optional in the result
            expected_keys = ['sse', 'avx', 'avx2']
            for key in expected_keys:
                if key in capabilities:
                    assert isinstance(capabilities[key], bool), f"SIMD {key} should be boolean"

        except ImportError:
            pass  # Vectorization optimizations not ready yet


class TestHardwareAcceleration:
    """Test hardware acceleration and device management."""

    def test_device_detection(self):
        """Test hardware device detection and selection."""
        try:
            from tinytorch.core.kernels import Device, get_available_devices

            # Device detection
            devices = get_available_devices()

            assert isinstance(devices, list), "Available devices should be list"
            assert len(devices) > 0, "Should detect at least CPU"

            # Should include CPU at minimum
            device_types = [device.type for device in devices]
            assert 'cpu' in device_types, "CPU device not detected"

            # Device object
            cpu_device = Device('cpu')
            assert cpu_device.type == 'cpu', "CPU device creation broken"

            # Test CUDA device if available
            try:
                cuda_device = Device('cuda:0')
                assert cuda_device.type == 'cuda', "CUDA device creation broken"
            except RuntimeError:
                pass  # CUDA not available on this system, which is fine

        except ImportError:
            pass  # Device detection not ready yet

    def test_tensor_device_movement(self):
        """Test moving tensors between devices."""
        try:
            from tinytorch.core.tensor import Tensor
            # Device is not referenced below; importing it gates this test on
            # the kernels module providing device support.
            from tinytorch.core.kernels import Device  # noqa: F401

            # Create tensor on CPU
            tensor = Tensor(np.random.randn(50, 50))

            # Should start on CPU
            if hasattr(tensor, 'device'):
                assert tensor.device.type == 'cpu', "Tensor not starting on CPU"

            # Test moving to different device (if available)
            if hasattr(tensor, 'to'):
                # Try moving to CUDA (will fallback to CPU if not available)
                try:
                    cuda_tensor = tensor.to('cuda')
                    if hasattr(cuda_tensor, 'device'):
                        assert cuda_tensor.device.type in ['cuda', 'cpu'], "Device movement broken"
                except RuntimeError:
                    pass  # CUDA not available for tensor movement

        except ImportError:
            pass  # Tensor device movement not ready yet

    def test_multi_gpu_support(self):
        """Test multi-GPU support and parallelization."""
        try:
            from tinytorch.core.kernels import MultiGPUManager, data_parallel

            # Multi-GPU manager
            gpu_manager = MultiGPUManager()

            available_gpus = gpu_manager.get_gpu_count()

            if available_gpus > 1:
                # Should be able to manage multiple devices
                devices = gpu_manager.get_device_list()
                assert len(devices) == available_gpus, "GPU device list incorrect"
            # else: multi-GPU not available, single device mode

            # Data parallel operations: test data parallel wrapper
            from tinytorch.core.layers import Dense

            model = Dense(10, 5)
            parallel_model = data_parallel(model, device_ids=[0])  # Single device for testing

            assert hasattr(parallel_model, 'forward'), "Data parallel wrapper broken"

        except ImportError:
            pass  # Multi-GPU support not ready yet


class TestRegressionPrevention:
    """Ensure previous modules still work after Module 13 development."""

    def test_no_complete_system_regression(self):
        """Verify complete ML system (01→12) unchanged."""
        # Core functionality should remain stable
        assert sys.version_info.major >= 3, "Foundation: Python detection broken"

        # Complete ML system should still work
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense
            from tinytorch.core.optimizers import Adam
            from tinytorch.core.training import Trainer
            from tinytorch.core.compression import prune_weights

            # All components should work together
            model = Dense(8, 4)
            optimizer = Adam(model.parameters(), lr=0.001)
            Trainer(model, optimizer)  # successful construction is the check

            x = Tensor(np.random.randn(2, 8))
            output = model(x)
            assert output.shape == (2, 4), "System regression: Forward pass broken"

            # Compression should still work
            pruned = prune_weights(model.weights, sparsity=0.2)
            assert pruned.shape == model.weights.shape, "System regression: Compression broken"

        except ImportError:
            # Uses the module-level ``np``.  A function-local
            # ``import numpy as np`` here would make ``np`` a local name for
            # the whole function and break the ``np.random.randn`` call above
            # with UnboundLocalError whenever the imports succeed.
            assert np.random is not None, "System regression: Basic functionality broken"

    def test_no_efficiency_regression(self):
        """Verify efficiency features (11→12) unchanged."""
        try:
            from tinytorch.core.training import Trainer
            from tinytorch.core.compression import quantize_weights
            from tinytorch.core.optimizers import SGD
            from tinytorch.core.layers import Dense

            # Efficiency features should still work
            model = Dense(6, 3)
            optimizer = SGD(model.parameters(), lr=0.01)
            trainer = Trainer(model, optimizer)

            assert hasattr(trainer, 'train') or hasattr(trainer, 'fit'), "Efficiency regression: Training broken"

            # Compression should still work
            quantized = quantize_weights(model.weights, bits=8)
            assert quantized.shape == model.weights.shape, "Efficiency regression: Quantization broken"

        except ImportError:
            # Basic functionality should work
            assert np is not None, "Efficiency regression: Basic functionality broken"

    def test_progressive_stability(self):
        """Test the progressive stack is stable through kernel optimization."""
        # Stack should be stable through: Setup → ... → Compression → Kernels

        # Setup level
        assert np is not None, "Setup level broken"

        # Complete ML system level (if available)
        try:
            from tinytorch.core.tensor import Tensor
            from tinytorch.core.layers import Dense
            from tinytorch.core.optimizers import Adam
            from tinytorch.core.training import Trainer

            # Complete system should work
            model = Dense(10, 5)
            optimizer = Adam(model.parameters(), lr=0.001)
            Trainer(model, optimizer)  # successful construction is the check

            x = Tensor(np.random.randn(3, 10))
            output = model(x)
            assert output.shape == (3, 5), "ML system level broken"

        except ImportError:
            pass  # Not implemented yet

        # Kernel optimization level (if available)
        try:
            from tinytorch.core.kernels import optimized_matmul

            # Kernel optimizations should work with plain NumPy arrays
            A = np.random.randn(20, 15)
            B = np.random.randn(15, 10)

            result = optimized_matmul(A, B)
            assert result.shape == (20, 10), "Kernel optimization level broken"

        except ImportError:
            pass  # Not implemented yet