""" Performance Tests for Module 17: Quantization Tests whether quantization actually provides the claimed 4ร— speedup and memory reduction with <1% accuracy loss. Key questions: - Does INT8 quantization actually reduce memory by 4ร—? - Is there a real inference speedup from quantization? - Is accuracy loss actually <1% as claimed? - Does quantization work on realistic CNN models? """ import sys import os import time import numpy as np from pathlib import Path # Add the performance framework to path sys.path.append(str(Path(__file__).parent)) from performance_test_framework import PerformanceTestSuite, PerformanceComparator, WorkloadGenerator # Add module path sys.path.append(str(Path(__file__).parent.parent.parent / 'modules' / '17_quantization')) try: from quantization_dev import ( BaselineCNN, QuantizedCNN, INT8Quantizer, QuantizationPerformanceAnalyzer, QuantizationSystemsAnalyzer, QuantizedConv2d ) QUANTIZATION_AVAILABLE = True except ImportError: print("โŒ Module 17 quantization tools not available") QUANTIZATION_AVAILABLE = False class Module17PerformanceTests: """Test suite for Module 17 quantization techniques.""" def __init__(self): self.suite = PerformanceTestSuite() self.comparator = PerformanceComparator() self.workloads = WorkloadGenerator() def test_memory_reduction(self): """Test whether quantization actually reduces memory by 4ร—.""" if not QUANTIZATION_AVAILABLE: return "Quantization module not available" print("๐Ÿ’พ Testing memory reduction from quantization") # Create models baseline_model = BaselineCNN(input_channels=3, num_classes=10) quantized_model = QuantizedCNN(input_channels=3, num_classes=10) # Quantize the model calibration_data = [np.random.randn(1, 3, 32, 32) for _ in range(5)] quantized_model.calibrate_and_quantize(calibration_data) # Measure memory usage def calculate_model_memory(model): """Calculate memory usage of model parameters.""" total_bytes = 0 # Baseline model memory if hasattr(model, 'conv1_weight'): total_bytes += model.conv1_weight.nbytes + model.conv1_bias.nbytes total_bytes += model.conv2_weight.nbytes + model.conv2_bias.nbytes total_bytes += model.fc.nbytes # Quantized model memory elif hasattr(model, 'conv1'): # Conv layers if hasattr(model.conv1, 'weight_quantized') and model.conv1.is_quantized: total_bytes += model.conv1.weight_quantized.nbytes else: total_bytes += model.conv1.weight_fp32.nbytes if hasattr(model.conv2, 'weight_quantized') and model.conv2.is_quantized: total_bytes += model.conv2.weight_quantized.nbytes else: total_bytes += model.conv2.weight_fp32.nbytes # FC layer total_bytes += model.fc.nbytes return total_bytes / (1024 * 1024) # Convert to MB baseline_memory_mb = calculate_model_memory(baseline_model) quantized_memory_mb = calculate_model_memory(quantized_model) memory_reduction = baseline_memory_mb / quantized_memory_mb # Check if we achieved close to 4ร— reduction # Note: Only conv layers are quantized, FC layer remains FP32 conv_portion = 0.7 # Approximately 70% of model is conv weights expected_reduction = 1 / (conv_portion * 0.25 + (1 - conv_portion) * 1.0) # ~2.3ร— memory_test_passed = memory_reduction > 1.8 # At least some reduction result = { 'baseline_memory_mb': baseline_memory_mb, 'quantized_memory_mb': quantized_memory_mb, 'memory_reduction': memory_reduction, 'expected_reduction': expected_reduction, 'memory_test_passed': memory_test_passed } if memory_test_passed: print(f"โœ… Memory reduction achieved: {memory_reduction:.2f}ร— reduction") else: print(f"โŒ Insufficient memory reduction: 
    def test_inference_speedup(self):
        """Test whether quantized inference is actually faster."""
        if not QUANTIZATION_AVAILABLE:
            return "Quantization module not available"

        print("🚀 Testing inference speedup from quantization")

        # Create models
        baseline_model = BaselineCNN(input_channels=3, num_classes=10)
        quantized_model = QuantizedCNN(input_channels=3, num_classes=10)

        # Quantize the model
        calibration_data = [np.random.randn(1, 3, 32, 32) for _ in range(5)]
        quantized_model.calibrate_and_quantize(calibration_data)

        # Create test input
        test_input = np.random.randn(4, 3, 32, 32)

        # Wrapper functions for timing
        def baseline_inference():
            return baseline_model.forward(test_input)

        def quantized_inference():
            return quantized_model.forward(test_input)

        # Verify results are close
        try:
            baseline_output = baseline_inference()
            quantized_output = quantized_inference()

            # Check if outputs are reasonably close
            output_close = np.allclose(baseline_output, quantized_output, rtol=0.1, atol=0.1)
            if not output_close:
                print("⚠️ Warning: Quantized output differs significantly from baseline")
        except Exception as e:
            return f"Inference test error: {e}"

        # Performance comparison
        comparison = self.comparator.compare_implementations(
            baseline_inference, quantized_inference,
            baseline_name="fp32_inference",
            optimized_name="int8_inference"
        )

        # Note: educational quantization may not show a speedup without real INT8
        # kernels, so any improvement or a small regression is acceptable.
        reasonable_performance = comparison.speedup > 0.5  # No worse than 2× slower

        result = {
            'speedup': comparison.speedup,
            'reasonable_performance': reasonable_performance,
            'baseline_time_ms': comparison.baseline.mean_time_ms,
            'quantized_time_ms': comparison.optimized.mean_time_ms,
            'outputs_close': output_close
        }

        if comparison.speedup > 1.1:
            print(f"🎉 Quantization speedup achieved: {comparison.speedup:.2f}×")
        elif reasonable_performance:
            print(f"✅ Quantization performance reasonable: {comparison.speedup:.2f}×")
            print("   (Educational implementation - production would use INT8 kernels)")
        else:
            print(f"❌ Quantization performance poor: {comparison.speedup:.2f}×")

        return result

    def test_accuracy_preservation(self):
        """Test whether quantization preserves accuracy as claimed (<1% loss)."""
        if not QUANTIZATION_AVAILABLE:
            return "Quantization module not available"

        print("🎯 Testing accuracy preservation in quantization")

        # Create models
        baseline_model = BaselineCNN(input_channels=3, num_classes=10)
        quantized_model = QuantizedCNN(input_channels=3, num_classes=10)

        # Copy weights from baseline to quantized before quantization
        quantized_model.conv1.weight_fp32 = baseline_model.conv1_weight.copy()
        quantized_model.conv1.bias = baseline_model.conv1_bias.copy()
        quantized_model.conv2.weight_fp32 = baseline_model.conv2_weight.copy()
        quantized_model.conv2.bias = baseline_model.conv2_bias.copy()
        quantized_model.fc = baseline_model.fc.copy()

        # Generate test dataset
        test_size = 100
        test_inputs = np.random.randn(test_size, 3, 32, 32)

        # Get baseline predictions
        baseline_outputs = baseline_model.forward(test_inputs)
        baseline_predictions = np.argmax(baseline_outputs, axis=1)

        # Quantize model
        calibration_data = [test_inputs[:5]]  # Use some test data for calibration
        quantized_model.calibrate_and_quantize(calibration_data)

        # Get quantized predictions
        quantized_outputs = quantized_model.forward(test_inputs)
        quantized_predictions = np.argmax(quantized_outputs, axis=1)

        # Calculate accuracy metrics
        prediction_agreement = np.mean(baseline_predictions == quantized_predictions)
        output_mse = np.mean((baseline_outputs - quantized_outputs) ** 2)
        output_mae = np.mean(np.abs(baseline_outputs - quantized_outputs))

        # Check accuracy preservation
        high_agreement = prediction_agreement > 0.95  # 95%+ of predictions should match
        low_output_difference = output_mae < 1.0  # Mean absolute error < 1.0
        accuracy_preserved = high_agreement and low_output_difference

        result = {
            'prediction_agreement': prediction_agreement,
            'output_mse': output_mse,
            'output_mae': output_mae,
            'high_agreement': high_agreement,
            'low_output_difference': low_output_difference,
            'accuracy_preserved': accuracy_preserved,
            'test_samples': test_size
        }

        if accuracy_preserved:
            print(f"✅ Accuracy preserved: {prediction_agreement:.1%} agreement, {output_mae:.3f} MAE")
        else:
            print(f"❌ Accuracy degraded: {prediction_agreement:.1%} agreement, {output_mae:.3f} MAE")

        return result
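    # For reference, the round trip exercised by test_quantization_precision below
    # follows the standard affine (asymmetric) scheme — a sketch of the usual
    # formulas, assuming INT8Quantizer uses the common convention (its exact
    # API is defined in quantization_dev):
    #   scale      = (x_max - x_min) / (q_max - q_min)    # e.g. q range [-128, 127]
    #   zero_point = round(q_min - x_min / scale)
    #   quantize:   q     = clip(round(x / scale) + zero_point, q_min, q_max)
    #   dequantize: x_hat = scale * (q - zero_point)
    # Worst-case rounding error per element is scale/2, so relative error shrinks
    # as the tensor's dynamic range is spread across more of the 256 levels.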
    def test_quantization_precision(self):
        """Test the accuracy of the quantization/dequantization process."""
        if not QUANTIZATION_AVAILABLE:
            return "Quantization module not available"

        print("🔬 Testing quantization precision")

        quantizer = INT8Quantizer()

        # Test on different types of data
        test_cases = [
            ("small_weights", np.random.randn(100, 100) * 0.1),
            ("large_weights", np.random.randn(100, 100) * 2.0),
            ("uniform_weights", np.random.uniform(-1, 1, (100, 100))),
            ("sparse_weights", np.random.randn(100, 100) * 0.01)
        ]

        precision_results = {}

        for name, weights in test_cases:
            # Quantize and dequantize
            scale, zero_point = quantizer.compute_quantization_params(weights)
            quantized = quantizer.quantize_tensor(weights, scale, zero_point)
            dequantized = quantizer.dequantize_tensor(quantized, scale, zero_point)

            # Calculate precision metrics
            mse = np.mean((weights - dequantized) ** 2)
            mae = np.mean(np.abs(weights - dequantized))
            max_error = np.max(np.abs(weights - dequantized))

            # Relative error, normalized by the tensor's dynamic range
            weight_range = np.max(weights) - np.min(weights)
            relative_mae = mae / weight_range if weight_range > 0 else 0

            precision_results[name] = {
                'mse': mse,
                'mae': mae,
                'max_error': max_error,
                'relative_mae': relative_mae,
                'good_precision': relative_mae < 0.02  # < 2% relative error
            }

            print(f"   {name}: MAE={mae:.4f}, relative={relative_mae:.1%}")

        # Overall precision test
        all_good_precision = all(result['good_precision'] for result in precision_results.values())

        result = {
            'test_cases': precision_results,
            'all_good_precision': all_good_precision
        }

        if all_good_precision:
            print("✅ Quantization precision good across all test cases")
        else:
            print("❌ Quantization precision issues detected")

        return result

    def test_systems_analysis_accuracy(self):
        """Test whether the systems analysis tools provide accurate assessments."""
        if not QUANTIZATION_AVAILABLE:
            return "Quantization module not available"

        print("📊 Testing systems analysis accuracy")

        try:
            analyzer = QuantizationSystemsAnalyzer()

            # Test precision vs performance analysis
            analysis = analyzer.analyze_precision_tradeoffs([32, 16, 8, 4])

            # Validate analysis structure (all four keys are consumed below)
            required_keys = ['bit_widths', 'compute_efficiency', 'typical_accuracy_loss', 'memory_per_param']
            has_required_keys = all(key in analysis for key in required_keys)

            # Validate logical relationships: memory per parameter must not grow,
            # and typical accuracy loss must not shrink, as bit width decreases
            memory_decreases = all(analysis['memory_per_param'][i] >= analysis['memory_per_param'][i+1]
                                   for i in range(len(analysis['memory_per_param'])-1))
            accuracy_loss_increases = all(analysis['typical_accuracy_loss'][i] <= analysis['typical_accuracy_loss'][i+1]
                                          for i in range(len(analysis['typical_accuracy_loss'])-1))

            # Check if INT8 is identified as optimal: score each bit width by
            # compute efficiency discounted by its typical accuracy loss
            efficiency_ratios = [s / (1 + a) for s, a in zip(analysis['compute_efficiency'],
                                                             analysis['typical_accuracy_loss'])]
            optimal_idx = np.argmax(efficiency_ratios)
            optimal_bits = analysis['bit_widths'][optimal_idx]
            int8_optimal = optimal_bits == 8

            analysis_result = {
                'has_required_keys': has_required_keys,
                'memory_decreases_correctly': memory_decreases,
                'accuracy_loss_increases_correctly': accuracy_loss_increases,
                'int8_identified_as_optimal': int8_optimal,
                'optimal_bits': optimal_bits,
                'analysis_logical': has_required_keys and memory_decreases and accuracy_loss_increases
            }

            if analysis_result['analysis_logical'] and int8_optimal:
                print("✅ Systems analysis provides logical and accurate assessments")
            else:
                print("❌ Systems analysis has logical inconsistencies")

            return analysis_result

        except Exception as e:
            return f"Systems analysis error: {e}"
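    # Reference points for the monotonicity checks above — bytes per parameter
    # by bit width, plain arithmetic independent of the analyzer under test:
    #   32-bit → 4.0,  16-bit → 2.0,  8-bit → 1.0,  4-bit → 0.5
    # The test expects INT8 to win the efficiency ratio because sub-8-bit widths
    # typically pay more in accuracy loss than they gain in compute efficiency.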
    def test_quantization_performance_analyzer(self):
        """Test the quantization performance analyzer tool."""
        if not QUANTIZATION_AVAILABLE:
            return "Quantization module not available"

        print("📈 Testing quantization performance analyzer")

        try:
            # Create models
            baseline_model = BaselineCNN(input_channels=3, num_classes=10)
            quantized_model = QuantizedCNN(input_channels=3, num_classes=10)

            # Quantize model
            calibration_data = [np.random.randn(1, 3, 32, 32) for _ in range(3)]
            quantized_model.calibrate_and_quantize(calibration_data)

            # Test data
            test_data = np.random.randn(4, 3, 32, 32)

            # Use the performance analyzer
            analyzer = QuantizationPerformanceAnalyzer()
            results = analyzer.benchmark_models(baseline_model, quantized_model, test_data, num_runs=5)

            # Validate analyzer results
            required_metrics = ['memory_reduction', 'speedup', 'prediction_agreement']
            has_required_metrics = all(metric in results for metric in required_metrics)

            reasonable_values = (
                results['memory_reduction'] > 1.0 and
                results['speedup'] > 0.1 and  # May be slower in an educational implementation
                results['prediction_agreement'] >= 0.0
            )

            analyzer_result = {
                'has_required_metrics': has_required_metrics,
                'reasonable_values': reasonable_values,
                'memory_reduction': results['memory_reduction'],
                'speedup': results['speedup'],
                'prediction_agreement': results['prediction_agreement'],
                'analyzer_working': has_required_metrics and reasonable_values
            }

            if analyzer_result['analyzer_working']:
                print(f"✅ Performance analyzer working: {results['memory_reduction']:.1f}× memory, "
                      f"{results['speedup']:.1f}× speed, {results['prediction_agreement']:.1%} agreement")
            else:
                print("❌ Performance analyzer has issues")

            return analyzer_result

        except Exception as e:
            return f"Performance analyzer error: {e}"


def run_module_17_performance_tests():
    """Run all performance tests for Module 17."""
    print("🧪 TESTING MODULE 17: QUANTIZATION")
    print("=" * 60)
    print("Verifying that quantization provides real benefits with minimal accuracy loss")

    if not QUANTIZATION_AVAILABLE:
        print("❌ Cannot test Module 17 - quantization tools not available")
        return

    test_suite = Module17PerformanceTests()

    tests = {
        'memory_reduction': test_suite.test_memory_reduction,
        'inference_speedup': test_suite.test_inference_speedup,
        'accuracy_preservation': test_suite.test_accuracy_preservation,
        'quantization_precision': test_suite.test_quantization_precision,
        'systems_analysis': test_suite.test_systems_analysis_accuracy,
        'performance_analyzer': test_suite.test_quantization_performance_analyzer
    }

    results = test_suite.suite.run_module_tests('module_17_quantization', tests)

    # Summary
    print("\n📊 MODULE 17 TEST SUMMARY")
    print("=" * 40)
    total_tests = len(tests)
    passed_tests = 0
    key_metrics = {}

    for test_name, result in results.items():
        if hasattr(result, 'speedup'):
            # ComparisonResult
            passed = result.speedup > 0.8  # Allow some performance variation
            key_metrics[f'{test_name}_speedup'] = result.speedup
        elif isinstance(result, dict):
            # Check specific success criteria for each test
            if 'memory_test_passed' in result:
                passed = result['memory_test_passed']
                key_metrics['memory_reduction'] = result.get('memory_reduction', 0)
            elif 'reasonable_performance' in result:
                passed = result['reasonable_performance']
            elif 'accuracy_preserved' in result:
                passed = result['accuracy_preserved']
                key_metrics['prediction_agreement'] = result.get('prediction_agreement', 0)
            elif 'all_good_precision' in result:
                passed = result['all_good_precision']
            elif 'analysis_logical' in result:
                passed = result['analysis_logical'] and result.get('int8_identified_as_optimal', False)
            elif 'analyzer_working' in result:
                passed = result['analyzer_working']
            else:
                passed = False
        else:
            passed = False

        if passed:
            passed_tests += 1
            print(f"✅ {test_name}: PASSED")
        else:
            print(f"❌ {test_name}: FAILED")

    success_rate = passed_tests / total_tests
    print(f"\nSUCCESS RATE: {success_rate:.1%} ({passed_tests}/{total_tests})")

    # Key insights
    if 'memory_reduction' in key_metrics:
        print(f"📊 Memory reduction: {key_metrics['memory_reduction']:.2f}×")
    if 'prediction_agreement' in key_metrics:
        print(f"🎯 Accuracy preservation: {key_metrics['prediction_agreement']:.1%}")

    if success_rate >= 0.7:
        print("🎉 Module 17 quantization is working effectively!")
        print("💡 Note: Performance gains depend on hardware INT8 support")
    else:
        print("⚠️ Module 17 quantization needs improvement")

    return results


if __name__ == "__main__":
    run_module_17_performance_tests()