# TinyTorch/tinytorch/optimization/quantization.py
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/16_quantization/quantization_dev.ipynb.
# %% auto 0
__all__ = ['QuantizationComplete', 'quantize_int8', 'dequantize_int8', 'quantize_model']
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 3
import numpy as np
import time
from typing import Tuple, Dict, List, Optional, Any
import warnings
# Import dependencies from other modules
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.activations import ReLU
print("✅ Quantization module imports complete")
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 34
class QuantizationComplete:
    """
    Complete quantization system for milestone use.

    Provides affine INT8 quantization for a 4x memory reduction
    (4-byte FP32 values become 1-byte INT8 values).
    """

    @staticmethod
    def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]:
        """Quantize an FP32 tensor to INT8, returning (quantized, scale, zero_point)."""
        data = tensor.data
        min_val = float(np.min(data))
        max_val = float(np.max(data))
        # Degenerate case: a (near-)constant tensor has no range to quantize,
        # so return zeros with an identity scale.
        if abs(max_val - min_val) < 1e-8:
            return Tensor(np.zeros_like(data, dtype=np.int8)), 1.0, 0
        # Map [min_val, max_val] onto the 255 steps of the INT8 range [-128, 127].
        scale = (max_val - min_val) / 255.0
        zero_point = int(np.round(-128 - min_val / scale))
        zero_point = int(np.clip(zero_point, -128, 127))
        quantized_data = np.round(data / scale + zero_point)
        quantized_data = np.clip(quantized_data, -128, 127).astype(np.int8)
        return Tensor(quantized_data), scale, zero_point

    @staticmethod
    def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
        """Dequantize an INT8 tensor back to FP32."""
        dequantized_data = (q_tensor.data.astype(np.float32) - zero_point) * scale
        return Tensor(dequantized_data)

    @staticmethod
    def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
        """
        Quantize all parameters of a model to INT8.

        Returns a dictionary with per-parameter quantization info and memory
        savings. `calibration_data` is accepted for API compatibility but is
        not used by this min/max (per-tensor) scheme.
        """
        quantized_layers = {}
        original_size = 0
        quantized_size = 0
        # Quantize every parameter tensor independently.
        if hasattr(model, 'parameters'):
            for i, param in enumerate(model.parameters()):
                param_size = param.data.nbytes
                original_size += param_size
                q_param, scale, zp = QuantizationComplete.quantize_tensor(param)
                quantized_size += q_param.data.nbytes
                quantized_layers[f'param_{i}'] = {
                    'quantized': q_param,
                    'scale': scale,
                    'zero_point': zp,
                    'original_shape': param.data.shape
                }
        return {
            'quantized_layers': quantized_layers,
            'original_size_mb': original_size / (1024 * 1024),
            'quantized_size_mb': quantized_size / (1024 * 1024),
            'compression_ratio': original_size / quantized_size if quantized_size > 0 else 1.0
        }

    @staticmethod
    def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]:
        """Compare memory usage between the original and quantized models."""
        # `original_model` is kept for API symmetry; all sizes are read from
        # the dictionary produced by `quantize_model`.
        return {
            'original_mb': quantized_info['original_size_mb'],
            'quantized_mb': quantized_info['quantized_size_mb'],
            'compression_ratio': quantized_info['compression_ratio'],
            'memory_saved_mb': quantized_info['original_size_mb'] - quantized_info['quantized_size_mb']
        }
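
# Memory arithmetic behind the 4x figure: FP32 stores 4 bytes per value and
# INT8 stores 1, so e.g. a 1M-parameter model shrinks from ~4.0 MB to ~1.0 MB;
# the per-tensor scale and zero_point add only a few bytes of overhead each.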

# Convenience functions for backward compatibility
def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]:
    """Quantize an FP32 tensor to INT8."""
    return QuantizationComplete.quantize_tensor(tensor)

def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
    """Dequantize an INT8 tensor back to FP32."""
    return QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point)

def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
    """Quantize an entire model to INT8."""
    return QuantizationComplete.quantize_model(model, calibration_data)
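
# Minimal smoke test (a sketch): `_ToyModel` below is a hypothetical stand-in
# for any object exposing `parameters()`; it is not part of TinyTorch's API.
# It only assumes `Tensor` wraps a NumPy array in `.data`, as the code above does.
if __name__ == "__main__":
    class _ToyModel:
        def __init__(self):
            # Two FP32 parameter tensors, 64*32 + 32 = 2080 values in total.
            self._params = [
                Tensor(np.random.randn(64, 32).astype(np.float32)),
                Tensor(np.random.randn(32).astype(np.float32)),
            ]

        def parameters(self):
            return self._params

    # Round-trip a tensor: reconstruction error is bounded by about one
    # quantization step (value rounding plus zero-point rounding).
    x = Tensor(np.linspace(-1.0, 1.0, 16).astype(np.float32))
    q, scale, zp = quantize_int8(x)
    x_hat = dequantize_int8(q, scale, zp)
    assert float(np.max(np.abs(x.data - x_hat.data))) <= scale + 1e-6

    # Quantize the toy model: 4-byte -> 1-byte parameters gives a 4.0x ratio.
    info = quantize_model(_ToyModel())
    print(f"compression: {info['compression_ratio']:.1f}x "
          f"({info['original_size_mb']:.4f} MB -> {info['quantized_size_mb']:.4f} MB)")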