""" Checkpoint 7: Stability (After Module 8 - Normalization) Question: "Can I stabilize training with normalization techniques?" """ import numpy as np import pytest def test_checkpoint_07_stability(): """ Checkpoint 7: Stability Validates that students can apply normalization techniques to stabilize deep network training - the key to making deep learning practical and enabling training of very deep networks. """ print("\nβš–οΈ Checkpoint 7: Stability") print("=" * 50) try: from tinytorch.core.tensor import Tensor from tinytorch.core.normalization import BatchNorm1D, LayerNorm from tinytorch.core.layers import Dense from tinytorch.core.activations import ReLU except ImportError as e: pytest.fail(f"❌ Cannot import required classes - complete Modules 2-8 first: {e}") # Test 1: Batch normalization print("πŸ“Š Testing batch normalization...") batch_norm = BatchNorm1D(num_features=10) # Create batch of activations batch_data = Tensor(np.random.randn(32, 10) * 3 + 2) # High variance, non-zero mean normalized = batch_norm(batch_data) # Check normalization properties mean = np.mean(normalized.data, axis=0) std = np.std(normalized.data, axis=0) assert normalized.shape == batch_data.shape, f"BatchNorm should preserve shape: {batch_data.shape}" assert np.allclose(mean, 0, atol=1e-6), f"BatchNorm should center data around 0, got mean={mean}" assert np.allclose(std, 1, atol=1e-6), f"BatchNorm should normalize variance to 1, got std={std}" print(f"βœ… Batch normalization: {batch_data.shape} β†’ normalized (meanβ‰ˆ0, stdβ‰ˆ1)") # Test 2: Layer normalization print("πŸ”§ Testing layer normalization...") layer_norm = LayerNorm(normalized_shape=8) # Create sequence data (common in transformers) sequence_data = Tensor(np.random.randn(2, 5, 8) * 4 + 1) # batch=2, seq=5, features=8 layer_normalized = layer_norm(sequence_data) # Check that each sample/sequence position is normalized assert layer_normalized.shape == sequence_data.shape, f"LayerNorm should preserve shape: {sequence_data.shape}" # Check normalization across feature dimension for each position for b in range(2): for s in range(5): features = layer_normalized.data[b, s, :] assert abs(np.mean(features)) < 1e-5, f"LayerNorm should center features at position ({b},{s})" assert abs(np.std(features) - 1) < 1e-5, f"LayerNorm should normalize variance at position ({b},{s})" print(f"βœ… Layer normalization: {sequence_data.shape} β†’ normalized per position") # Test 3: Normalization in deep networks print("πŸ—οΈ Testing normalization in deep networks...") # Build deep network with normalization layers = [ Dense(16, 32), BatchNorm1D(32), ReLU(), Dense(32, 32), BatchNorm1D(32), ReLU(), Dense(32, 16), BatchNorm1D(16), ReLU(), Dense(16, 1) ] # Test forward pass through deep normalized network input_data = Tensor(np.random.randn(8, 16)) x = input_data for i, layer in enumerate(layers): x = layer(x) if i % 3 == 1: # After each BatchNorm # Check that activations are well-behaved assert not np.any(np.isnan(x.data)), f"No NaN after layer {i}" assert not np.any(np.isinf(x.data)), f"No Inf after layer {i}" assert x.shape == (8, 1), f"Deep network output should be (8, 1), got {x.shape}" print(f"βœ… Deep normalized network: {input_data.shape} β†’ 4 layers β†’ {x.shape}") # Test 4: Gradient flow improvement print("πŸ“ˆ Testing gradient flow properties...") # Compare networks with and without normalization # Create identical architectures normalized_net = [ Dense(10, 20), BatchNorm1D(20), ReLU(), Dense(20, 10), BatchNorm1D(10), ReLU(), Dense(10, 1) ] unnormalized_net = [ Dense(10, 20), ReLU(), Dense(20, 10), ReLU(), Dense(10, 1) ] test_input = Tensor(np.random.randn(5, 10)) # Forward pass through both networks norm_x = test_input for layer in normalized_net: norm_x = layer(norm_x) unnorm_x = test_input for layer in unnormalized_net: unnorm_x = layer(unnorm_x) # Both should produce valid outputs assert not np.any(np.isnan(norm_x.data)), "Normalized network should produce stable outputs" assert not np.any(np.isnan(unnorm_x.data)), "Unnormalized network should produce valid outputs" print(f"βœ… Gradient flow: normalized and unnormalized networks both stable") # Test 5: Training vs inference modes print("πŸ”„ Testing training vs inference modes...") # Create batch norm layer bn = BatchNorm1D(num_features=5) # Training mode: use batch statistics training_data = Tensor(np.random.randn(10, 5) * 2 + 1) if hasattr(bn, 'training'): bn.training = True train_output = bn(training_data) # Should normalize based on current batch train_mean = np.mean(train_output.data, axis=0) assert np.allclose(train_mean, 0, atol=1e-5), "Training mode should use batch statistics" # Inference mode: use running statistics (if implemented) if hasattr(bn, 'training'): bn.training = False # Single sample inference single_sample = Tensor(np.random.randn(1, 5)) inference_output = bn(single_sample) assert inference_output.shape == (1, 5), f"Inference should work on single samples: {inference_output.shape}" print(f"βœ… Mode switching: training and inference modes both functional") # Test 6: Learnable parameters in normalization print("πŸ“š Testing learnable normalization parameters...") # Check that normalization layers have learnable parameters bn_with_params = BatchNorm1D(num_features=8) assert hasattr(bn_with_params, 'gamma') or hasattr(bn_with_params, 'weight'), "BatchNorm should have scale parameters" assert hasattr(bn_with_params, 'beta') or hasattr(bn_with_params, 'bias'), "BatchNorm should have shift parameters" # Test that parameters affect output test_data = Tensor(np.ones((4, 8))) # All ones original_output = bn_with_params(test_data) # Modify parameters if hasattr(bn_with_params, 'gamma'): bn_with_params.gamma.data *= 2 bn_with_params.beta.data += 1 elif hasattr(bn_with_params, 'weight'): bn_with_params.weight.data *= 2 bn_with_params.bias.data += 1 modified_output = bn_with_params(test_data) # Output should change when parameters change assert not np.allclose(original_output.data, modified_output.data), "Learnable parameters should affect output" print(f"βœ… Learnable parameters: scale and shift parameters modify normalization") # Test 7: Numerical stability print("πŸ”’ Testing numerical stability...") # Test with extreme values extreme_data = Tensor(np.array([[1e6, -1e6, 1e-6, -1e-6, 0]])) stable_bn = BatchNorm1D(num_features=5) try: stable_output = stable_bn(extreme_data) assert not np.any(np.isnan(stable_output.data)), "Should handle extreme values without NaN" assert not np.any(np.isinf(stable_output.data)), "Should handle extreme values without Inf" print(f"βœ… Numerical stability: handles extreme values β†’ {stable_output.shape}") except Exception as e: print(f"⚠️ Numerical stability: some issues with extreme values ({e})") print("\nπŸŽ‰ Stability Complete!") print("πŸ“ You can now stabilize training with normalization techniques") print("πŸ”§ Built capabilities: Batch normalization, layer normalization, stable deep networks") print("🧠 Breakthrough: You can now train deep networks reliably!") print("🎯 Next: Add automatic differentiation for learning") if __name__ == "__main__": test_checkpoint_07_stability()