# AUTOGENERATED FROM modules/17_quantization/quantization_dev.py
# This file was generated manually due to directory structure reorganization

__all__ = ['BaselineCNN', 'INT8Quantizer', 'QuantizedConv2d', 'QuantizedCNN',
           'QuantizationPerformanceAnalyzer', 'QuantizationSystemsAnalyzer',
           'QuantizationMemoryProfiler', 'ProductionQuantizationInsights']

import math
import time
import numpy as np
import sys
import os
from typing import Union, List, Optional, Tuple, Dict, Any

# Import from the main package - try package first, then local modules
try:
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.spatial import Conv2d, MaxPool2D
    MaxPool2d = MaxPool2D  # Alias for consistent naming
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '06_spatial'))
    try:
        from tensor_dev import Tensor
        from spatial_dev import Conv2d, MaxPool2D
        MaxPool2d = MaxPool2D  # Alias for consistent naming
    except ImportError:
        # Create minimal mock classes if not available
        class Tensor:
            def __init__(self, data):
                self.data = np.array(data)
                self.shape = self.data.shape

        class Conv2d:
            def __init__(self, in_channels, out_channels, kernel_size):
                self.weight = np.random.randn(out_channels, in_channels, kernel_size, kernel_size)

        class MaxPool2d:
            def __init__(self, kernel_size):
                self.kernel_size = kernel_size

        # Keep both spellings available, as the two import branches above do.
        MaxPool2D = MaxPool2d


# NOTE: the weight arrays in this module are created with ``np.random.randn``,
# which returns float64. The "FP32" naming is therefore nominal, and the
# compression ratio reported for such weights against int8 is 8.0, not 4.0.


def _conv2d_valid(x: np.ndarray, weight: np.ndarray, bias) -> np.ndarray:
    """Stride-1, no-padding 2D convolution (cross-correlation) plus bias.

    Shared by ``BaselineCNN`` and ``QuantizedConv2d`` so both models run the
    exact same arithmetic.

    Args:
        x: Input indexed as (batch, channel, height, width).
        weight: Kernels indexed as (out_channel, in_channel, kh, kw).
        bias: One value per output channel.

    Returns:
        float64 array of shape (batch, out_channel, in_h - kh + 1, in_w - kw + 1).

    Raises:
        IndexError: If ``x`` has fewer channels than ``weight`` expects or
            ``bias`` has fewer entries than there are output channels.
    """
    batch, x_channels, in_h, in_w = x.shape
    out_ch, in_ch, kh, kw = weight.shape
    out_h = in_h - kh + 1
    out_w = in_w - kw + 1
    output = np.zeros((batch, out_ch, out_h, out_w))

    bias_values = np.asarray(bias)
    if x_channels < in_ch:
        raise IndexError(
            "input has %d channels but the kernel expects %d" % (x_channels, in_ch)
        )
    if bias_values.shape[0] < out_ch:
        raise IndexError(
            "bias has %d entries but there are %d output channels"
            % (bias_values.shape[0], out_ch)
        )
    if output.size == 0:
        return output

    # Loop only over kernel offsets; the batch/channel/spatial sums are done
    # by einsum. Channels beyond ``in_ch`` are ignored.
    for kh_i in range(kh):
        for kw_i in range(kw):
            window = x[:, :in_ch, kh_i:kh_i + out_h, kw_i:kw_i + out_w]
            output += np.einsum('bchw,oc->bohw', window, weight[:, :, kh_i, kw_i])

    output += bias_values[:out_ch].reshape(1, out_ch, 1, 1)
    return output


def _maxpool2d(x: np.ndarray, pool_size: int) -> np.ndarray:
    """Non-overlapping max pooling with a square window.

    Trailing rows/columns that do not fill a complete window are dropped.

    Args:
        x: Input indexed as (batch, channel, height, width).
        pool_size: Side length of the pooling window (also the stride).

    Returns:
        float64 array of shape (batch, channel, in_h // pool_size, in_w // pool_size).
    """
    batch, ch, in_h, in_w = x.shape
    out_h = in_h // pool_size
    out_w = in_w // pool_size
    cropped = x[:, :, :out_h * pool_size, :out_w * pool_size]
    # Split each spatial axis into (window index, offset inside window) and
    # take the max over the two offset axes.
    windows = cropped.reshape(batch, ch, out_h, pool_size, out_w, pool_size)
    return windows.max(axis=(3, 5)).astype(np.float64)


class BaselineCNN:
    """
    Baseline floating-point CNN for comparison with the quantized version.

    Architecture: conv(3x3, 32) -> ReLU -> pool(2) -> conv(3x3, 64) -> ReLU
    -> pool(2) -> flatten -> fully connected. The fully connected layer is
    sized for 32x32 inputs (6x6 feature maps after both conv/pool stages).
    """

    def __init__(self, input_channels: int = 3, num_classes: int = 10):
        """Initialize baseline CNN with randomly initialized weights."""
        self.input_channels = input_channels
        self.num_classes = num_classes

        # Conv1: input_channels -> 32, kernel 3x3
        self.conv1_weight = np.random.randn(32, input_channels, 3, 3) * 0.02
        self.conv1_bias = np.zeros(32)

        # Conv2: 32 -> 64, kernel 3x3
        self.conv2_weight = np.random.randn(64, 32, 3, 3) * 0.02
        self.conv2_bias = np.zeros(64)

        # Pooling (no parameters)
        self.pool_size = 2

        # Fully connected layer (assuming 32x32 input -> 6x6 after convs+pools)
        self.fc_input_size = 64 * 6 * 6  # 64 channels, 6x6 spatial
        self.fc = np.random.randn(self.fc_input_size, num_classes) * 0.02

    def _count_parameters(self) -> int:
        """Count total parameters in the model (conv weights + biases, FC weights)."""
        conv1_params = 32 * self.input_channels * 3 * 3 + 32  # weights + bias
        conv2_params = 64 * 32 * 3 * 3 + 64
        fc_params = self.fc_input_size * self.num_classes
        return conv1_params + conv2_params + fc_params

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward pass; returns logits of shape (batch, num_classes)."""
        batch_size = x.shape[0]

        # Conv1 + ReLU + Pool
        conv1_out = self._conv2d_forward(x, self.conv1_weight, self.conv1_bias)
        conv1_relu = np.maximum(0, conv1_out)
        pool1_out = self._maxpool2d_forward(conv1_relu, self.pool_size)

        # Conv2 + ReLU + Pool
        conv2_out = self._conv2d_forward(pool1_out, self.conv2_weight, self.conv2_bias)
        conv2_relu = np.maximum(0, conv2_out)
        pool2_out = self._maxpool2d_forward(conv2_relu, self.pool_size)

        # Flatten
        flattened = pool2_out.reshape(batch_size, -1)

        # Fully connected
        logits = flattened @ self.fc
        return logits

    def _conv2d_forward(self, x: np.ndarray, weight: np.ndarray, bias: np.ndarray) -> np.ndarray:
        """Stride-1, no-padding convolution with bias."""
        return _conv2d_valid(x, weight, bias)

    def _maxpool2d_forward(self, x: np.ndarray, pool_size: int) -> np.ndarray:
        """Non-overlapping max pooling."""
        return _maxpool2d(x, pool_size)

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Return the argmax class index for each sample in the batch."""
        logits = self.forward(x)
        return np.argmax(logits, axis=1)


class INT8Quantizer:
    """
    INT8 quantizer for neural network weights and activations.

    Converts floating-point arrays to int8 using an affine mapping
    ``q = round(x / scale + zero_point)`` clipped to [-128, 127], and back via
    ``x ~= (q - zero_point) * scale``.
    """

    def __init__(self):
        """Initialize the quantizer."""
        self.calibration_stats = {}

    def compute_quantization_params(self, tensor: np.ndarray, symmetric: bool = True) -> Tuple[float, int]:
        """Compute quantization scale and zero point for a tensor.

        Args:
            tensor: Values whose range determines the parameters.
            symmetric: If True, use the range [-max|x|, +max|x|] with a zero
                point of 0; otherwise use [min, max] with a computed zero point.

        Returns:
            ``(scale, zero_point)``. ``scale`` is 1.0 when the range is zero.
        """
        int8_min = -128
        int8_max = 127
        int8_range = int8_max - int8_min  # 255 steps between -128 and 127

        tensor_min = float(np.min(tensor))
        tensor_max = float(np.max(tensor))

        if symmetric:
            # Symmetric quantization: use max absolute value
            max_abs = max(abs(tensor_min), abs(tensor_max))
            tensor_min = -max_abs
            tensor_max = max_abs

        tensor_range = tensor_max - tensor_min
        scale = 1.0 if tensor_range == 0 else tensor_range / int8_range

        zero_point = 0
        if not symmetric:
            # Choose the zero point so that tensor_min maps to int8_min.
            zero_point_fp = int8_min - tensor_min / scale
            zero_point = int(round(float(np.clip(zero_point_fp, int8_min, int8_max))))

        return scale, zero_point

    def quantize_tensor(self, tensor: np.ndarray, scale: float, zero_point: int) -> np.ndarray:
        """Quantize a floating-point tensor to int8 (round, then clip to [-128, 127])."""
        quantized_fp = tensor / scale + zero_point
        quantized_int = np.clip(np.round(quantized_fp), -128, 127)
        return quantized_int.astype(np.int8)

    def dequantize_tensor(self, quantized_tensor: np.ndarray, scale: float, zero_point: int) -> np.ndarray:
        """Map an int8 tensor back to floating point: ``(q - zero_point) * scale``."""
        return (quantized_tensor.astype(np.float32) - zero_point) * scale

    def quantize_weights(self, weights: np.ndarray,
                         calibration_data: Optional[List[np.ndarray]] = None) -> Dict[str, Any]:
        """Quantize weights symmetrically and report the round-trip error.

        Args:
            weights: Weight array to quantize.
            calibration_data: Accepted for API compatibility; not used.

        Returns:
            Dict with ``quantized_weights`` (int8), ``scale``, ``zero_point``,
            ``quantization_error`` (mean absolute error), ``max_error``
            (maximum absolute error), ``compression_ratio`` (original bytes /
            quantized bytes) and ``original_shape``.
        """
        scale, zero_point = self.compute_quantization_params(weights, symmetric=True)

        quantized_weights = self.quantize_tensor(weights, scale, zero_point)

        # Dequantize for error analysis
        dequantized_weights = self.dequantize_tensor(quantized_weights, scale, zero_point)
        abs_error = np.abs(weights - dequantized_weights)

        # Memory savings
        compression_ratio = weights.nbytes / quantized_weights.nbytes

        return {
            'quantized_weights': quantized_weights,
            'scale': scale,
            'zero_point': zero_point,
            'quantization_error': np.mean(abs_error),
            'max_error': np.max(abs_error),
            'compression_ratio': compression_ratio,
            'original_shape': weights.shape
        }


class QuantizedConv2d:
    """
    2D convolution layer whose weights can be stored as int8.

    After ``quantize_weights`` the layer keeps an int8 copy of its weights plus
    scale/zero point. ``forward`` dequantizes that copy back to floating point
    and runs the same convolution as the baseline; no integer arithmetic is
    performed. The original floating-point weights remain in ``weight_fp32``.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int):
        """Initialize quantized convolution layer."""
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size

        # Floating-point weights (quantized later via quantize_weights)
        weight_shape = (out_channels, in_channels, kernel_size, kernel_size)
        self.weight_fp32 = np.random.randn(*weight_shape) * 0.02
        self.bias = np.zeros(out_channels)

        # Quantization parameters (set during quantization)
        self.weight_quantized = None
        self.weight_scale = None
        self.weight_zero_point = None
        self.is_quantized = False

    def quantize_weights(self, quantizer: INT8Quantizer):
        """Quantize the layer weights using the provided quantizer."""
        result = quantizer.quantize_weights(self.weight_fp32)

        self.weight_quantized = result['quantized_weights']
        self.weight_scale = result['scale']
        self.weight_zero_point = result['zero_point']
        self.is_quantized = True

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Convolve ``x`` with the dequantized weights (or the originals if not quantized)."""
        if self.is_quantized:
            # Dequantize weights for computation
            weights = self.weight_scale * (
                self.weight_quantized.astype(np.float32) - self.weight_zero_point
            )
        else:
            weights = self.weight_fp32

        # Same convolution as the baseline model
        return _conv2d_valid(x, weights, self.bias)


class QuantizedCNN:
    """
    CNN with the same architecture as ``BaselineCNN`` whose convolution
    weights can be quantized to int8.

    The fully connected layer is kept in floating point. Quantization reduces
    the stored size of the convolution weights; the forward pass itself runs
    in floating point on dequantized weights.
    """

    def __init__(self, input_channels: int = 3, num_classes: int = 10):
        """Initialize quantized CNN."""
        self.input_channels = input_channels
        self.num_classes = num_classes

        # Quantized convolutional layers
        self.conv1 = QuantizedConv2d(input_channels, 32, kernel_size=3)
        self.conv2 = QuantizedConv2d(32, 64, kernel_size=3)

        # Pooling (unchanged)
        self.pool_size = 2

        # Fully connected (kept in floating point for simplicity)
        self.fc_input_size = 64 * 6 * 6
        self.fc = np.random.randn(self.fc_input_size, num_classes) * 0.02

        # Quantizer
        self.quantizer = INT8Quantizer()
        self.is_quantized = False

    def _count_parameters(self) -> int:
        """Count total parameters in the model (conv weights + biases, FC weights)."""
        conv1_params = 32 * self.input_channels * 3 * 3 + 32
        conv2_params = 64 * 32 * 3 * 3 + 64
        fc_params = self.fc_input_size * self.num_classes
        return conv1_params + conv2_params + fc_params

    def calibrate_and_quantize(self, calibration_data: List[np.ndarray]):
        """Quantize both convolution layers.

        ``calibration_data`` is accepted for API compatibility; only the
        weights are quantized here, so it is not used.
        """
        self.conv1.quantize_weights(self.quantizer)
        self.conv2.quantize_weights(self.quantizer)

        # Mark as quantized
        self.is_quantized = True

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward pass; returns logits of shape (batch, num_classes)."""
        batch_size = x.shape[0]

        # Conv1 + ReLU + Pool
        conv1_out = self.conv1.forward(x)
        conv1_relu = np.maximum(0, conv1_out)
        pool1_out = self._maxpool2d_forward(conv1_relu, self.pool_size)

        # Conv2 + ReLU + Pool
        conv2_out = self.conv2.forward(pool1_out)
        conv2_relu = np.maximum(0, conv2_out)
        pool2_out = self._maxpool2d_forward(conv2_relu, self.pool_size)

        # Flatten and FC
        flattened = pool2_out.reshape(batch_size, -1)
        logits = flattened @ self.fc
        return logits

    def _maxpool2d_forward(self, x: np.ndarray, pool_size: int) -> np.ndarray:
        """Non-overlapping max pooling."""
        return _maxpool2d(x, pool_size)

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Return the argmax class index for each sample in the batch."""
        logits = self.forward(x)
        return np.argmax(logits, axis=1)


class QuantizationPerformanceAnalyzer:
    """
    Compare a baseline model against a quantized model.

    Measures weight memory, wall-clock forward time, and how closely the two
    models' outputs and predictions agree.
    """

    def __init__(self):
        """Initialize the performance analyzer."""
        self.results = {}

    def benchmark_models(self, baseline_model: BaselineCNN, quantized_model: QuantizedCNN,
                         test_data: np.ndarray, num_runs: int = 10) -> Dict[str, Any]:
        """Benchmark baseline vs quantized models on ``test_data``.

        Args:
            baseline_model: Reference model.
            quantized_model: Model to compare against the reference.
            test_data: Batch passed to both models' ``forward``.
            num_runs: Number of timed forward passes per model (>= 1).

        Returns:
            Dict with memory (KB), average forward time (ms), speedup, mean
            absolute output difference, prediction agreement and batch size.
            The same dict is stored in ``self.results``.

        Raises:
            ValueError: If ``num_runs`` is less than 1.
        """
        if num_runs < 1:
            raise ValueError("num_runs must be at least 1")

        batch_size = test_data.shape[0]

        # Memory Analysis
        baseline_memory = self._calculate_memory_usage(baseline_model)
        quantized_memory = self._calculate_memory_usage(quantized_model)
        memory_reduction = baseline_memory / quantized_memory

        # Inference Speed Benchmark
        baseline_avg_time, baseline_output = self._time_forward(baseline_model, test_data, num_runs)
        quantized_avg_time, quantized_output = self._time_forward(quantized_model, test_data, num_runs)
        speedup = baseline_avg_time / quantized_avg_time

        # Accuracy Analysis
        output_diff = np.mean(np.abs(baseline_output - quantized_output))

        # Prediction agreement
        baseline_preds = np.argmax(baseline_output, axis=1)
        quantized_preds = np.argmax(quantized_output, axis=1)
        agreement = np.mean(baseline_preds == quantized_preds)

        results = {
            'memory_baseline_kb': baseline_memory,
            'memory_quantized_kb': quantized_memory,
            'memory_reduction': memory_reduction,
            'speed_baseline_ms': baseline_avg_time * 1000,
            'speed_quantized_ms': quantized_avg_time * 1000,
            'speedup': speedup,
            'output_difference': output_diff,
            'prediction_agreement': agreement,
            'batch_size': batch_size
        }

        self.results = results
        return results

    @staticmethod
    def _time_forward(model, test_data: np.ndarray, num_runs: int):
        """Run ``model.forward`` ``num_runs`` times; return (mean seconds, last output)."""
        timings = []
        output = None
        for _ in range(num_runs):
            # perf_counter is a monotonic, high-resolution clock meant for timing.
            start_time = time.perf_counter()
            output = model.forward(test_data)
            timings.append(time.perf_counter() - start_time)
        return np.mean(timings), output

    def _calculate_memory_usage(self, model) -> float:
        """Calculate the model's weight memory in KB.

        Counts the conv1/conv2 weights and the ``fc`` matrix; biases are not
        counted for either model type.
        """
        total_memory = 0
        for layer_name in ('conv1', 'conv2'):
            total_memory += self._conv_weight_bytes(model, layer_name)

        if hasattr(model, 'fc'):
            total_memory += model.fc.nbytes

        return total_memory / 1024  # Convert to KB

    @staticmethod
    def _conv_weight_bytes(model, layer_name: str) -> int:
        """Return the bytes used by one conv layer's weights, or 0 if absent."""
        layer = getattr(model, layer_name, None)
        if layer is None:
            # BaselineCNN keeps raw arrays named ``conv1_weight`` / ``conv2_weight``
            # instead of layer objects.
            weight = getattr(model, layer_name + '_weight', None)
            return weight.nbytes if weight is not None else 0

        if hasattr(layer, 'weight_quantized') and layer.is_quantized:
            return layer.weight_quantized.nbytes

        layer_bytes = 0
        if hasattr(layer, 'weight'):
            layer_bytes += layer.weight.nbytes
        if hasattr(layer, 'weight_fp32'):
            layer_bytes += layer.weight_fp32.nbytes
        return layer_bytes


class QuantizationSystemsAnalyzer:
    """
    Tabulate precision vs performance trade-offs across bit widths.

    The figures returned are fixed illustrative values stored in this class,
    not measurements.
    """

    # bits -> (compute efficiency relative to 32-bit, typical accuracy loss in
    # percentage points, hardware support, use cases)
    _PROFILES = {
        32: (1.0, 0.0, "Universal", "Training, high-precision inference"),
        16: (1.5, 0.1, "Modern GPUs, TPUs", "Large model inference, mixed precision training"),
        8: (4.0, 0.5, "CPUs, Mobile, Edge", "Mobile deployment, edge inference, production CNNs"),
        4: (8.0, 2.0, "Specialized chips", "Extreme compression, research applications"),
    }

    def __init__(self):
        """Initialize the systems analyzer."""
        pass

    def analyze_precision_tradeoffs(self, bit_widths: Optional[List[int]] = None) -> Dict[str, Any]:
        """Analyze precision vs performance trade-offs across bit widths.

        Args:
            bit_widths: Bit widths to tabulate; defaults to ``[32, 16, 8, 4]``.

        Returns:
            Dict of parallel lists keyed by ``bit_widths``, ``memory_per_param``
            (bytes), ``compute_efficiency``, ``typical_accuracy_loss``,
            ``hardware_support`` and ``use_cases``. ``bit_widths`` is a fresh
            list, so callers may mutate it freely.
        """
        widths = [32, 16, 8, 4] if bit_widths is None else list(bit_widths)

        results = {
            'bit_widths': widths,
            'memory_per_param': [],
            'compute_efficiency': [],
            'typical_accuracy_loss': [],
            'hardware_support': [],
            'use_cases': []
        }

        for bits in widths:
            if bits in self._PROFILES:
                efficiency, acc_loss, hw_support, use_case = self._PROFILES[bits]
            else:
                # Rough approximations for bit widths without a table entry
                efficiency = 32.0 / bits
                acc_loss = min(10.0, 32.0 / bits)
                hw_support = "Research only"
                use_case = "Experimental"

            results['memory_per_param'].append(bits / 8)  # bytes per parameter
            results['compute_efficiency'].append(efficiency)
            results['typical_accuracy_loss'].append(acc_loss)
            results['hardware_support'].append(hw_support)
            results['use_cases'].append(use_case)

        return results


class QuantizationMemoryProfiler:
    """
    Memory profiler comparing the parameter storage of a baseline model and a
    quantized model, layer by layer.
    """

    def __init__(self):
        """Initialize the memory profiler."""
        pass

    def profile_memory_usage(self, baseline_model: BaselineCNN, quantized_model: QuantizedCNN) -> Dict[str, Any]:
        """Profile parameter memory of baseline vs quantized models.

        Conv layers are counted as weights plus bias on both sides so the
        ratios compare like with like.

        Returns:
            Dict with ``baseline_total_kb``, ``quantized_total_kb`` and
            ``memory_saved_kb`` (whole KB, floored), plus the ratios
            ``conv_compression`` and ``total_compression``.
        """
        # Baseline model memory breakdown
        baseline_conv1_mem = baseline_model.conv1_weight.nbytes + baseline_model.conv1_bias.nbytes
        baseline_conv2_mem = baseline_model.conv2_weight.nbytes + baseline_model.conv2_bias.nbytes
        baseline_fc_mem = baseline_model.fc.nbytes
        baseline_total = baseline_conv1_mem + baseline_conv2_mem + baseline_fc_mem

        # Quantized model memory breakdown
        quant_conv1_mem = self._quantized_layer_bytes(quantized_model.conv1, baseline_conv1_mem)
        quant_conv2_mem = self._quantized_layer_bytes(quantized_model.conv2, baseline_conv2_mem)
        quant_fc_mem = quantized_model.fc.nbytes  # FC is not quantized
        quant_total = quant_conv1_mem + quant_conv2_mem + quant_fc_mem

        # Memory savings analysis
        conv_savings = (baseline_conv1_mem + baseline_conv2_mem) / (quant_conv1_mem + quant_conv2_mem)
        total_savings = baseline_total / quant_total

        return {
            'baseline_total_kb': baseline_total // 1024,
            'quantized_total_kb': quant_total // 1024,
            'conv_compression': conv_savings,
            'total_compression': total_savings,
            'memory_saved_kb': (baseline_total - quant_total) // 1024
        }

    @staticmethod
    def _quantized_layer_bytes(layer: QuantizedConv2d, unquantized_bytes: int) -> int:
        """Bytes for one conv layer: int8 weights + bias if quantized, else the baseline figure."""
        if layer.is_quantized:
            return layer.weight_quantized.nbytes + layer.bias.nbytes
        return unquantized_bytes


class ProductionQuantizationInsights:
    """
    Insights into how production ML systems use quantization.

    This class is PROVIDED to show real-world applications of the
    quantization techniques you've implemented.
    """

    @staticmethod
    def explain_production_patterns():
        """Return a list of dicts describing how production systems use quantization."""
        patterns = [
            {
                'system': 'TensorFlow Lite (Google)',
                'technique': 'Post-training INT8 quantization with calibration',
                'benefit': 'Enables ML on mobile devices and edge hardware',
                'challenge': 'Maintaining accuracy across diverse model architectures'
            },
            {
                'system': 'PyTorch Mobile (Meta)',
                'technique': 'Dynamic quantization with runtime calibration',
                'benefit': 'Reduces model size by 4× for mobile deployment',
                'challenge': 'Balancing quantization overhead vs inference speedup'
            },
            {
                'system': 'ONNX Runtime (Microsoft)',
                'technique': 'Mixed precision with selective layer quantization',
                'benefit': 'Optimizes critical layers while preserving accuracy',
                'challenge': 'Automated selection of quantization strategies'
            },
            {
                'system': 'Apple Core ML',
                'technique': 'INT8 quantization with hardware acceleration',
                'benefit': 'Leverages Neural Engine for ultra-fast inference',
                'challenge': 'Platform-specific optimization for different iOS devices'
            }
        ]
        return patterns

    @staticmethod
    def explain_advanced_techniques():
        """Return a list of one-line descriptions of advanced quantization techniques."""
        techniques = [
            "Mixed Precision: Quantize some layers to INT8, keep critical layers in FP32",
            "Dynamic Quantization: Quantize weights statically, activations dynamically",
            "Block-wise Quantization: Different quantization parameters for weight blocks",
            "Quantization-Aware Training: Train model to be robust to quantization",
            "Channel-wise Quantization: Separate scales for each output channel",
            "Adaptive Quantization: Adjust precision based on layer importance",
            "Hardware-Aware Quantization: Optimize for specific hardware capabilities",
            "Calibration-Free Quantization: Use statistical methods without data"
        ]
        return techniques