Files
TinyTorch/tinytorch/optimization/quantization.py
Vijay Janapa Reddi d3a126235c Restructure: Separate developer source (src/) from learner notebooks (modules/)
Major directory restructure to support both developer and learner workflows:

Structure Changes:
- NEW: src/ directory for Python source files (version controlled)
  - Files renamed: tensor.py → 01_tensor.py (matches directory naming)
  - All 20 modules moved from modules/ to src/
- CHANGED: modules/ now holds generated notebooks (gitignored)
  - Generated from src/*.py using jupytext
  - Learners work in notebooks, developers work in Python source
- UNCHANGED: tinytorch/ package (still auto-generated from notebooks)

Workflow: src/*.py → modules/*.ipynb → tinytorch/*.py

Command Updates:
- Updated export command to read from src/ and generate to modules/
- Export flow: discovers modules in src/, converts to notebooks in modules/, exports to tinytorch/
- All 20 modules tested and working

Configuration:
- Updated .gitignore to ignore modules/ directory
- Updated README.md with new three-layer architecture explanation
- Updated export.py source mappings and paths

Benefits:
- Clean separation: developers edit Python, learners use notebooks
- Better version control: only Python source committed, notebooks generated
- Flexible learning: can work in notebooks OR Python source
- Maintains backward compatibility: tinytorch package unchanged

Tested:
- Single module export: tito export 01_tensor 
- All modules export: tito export --all 
- Package imports: from tinytorch.core.tensor import Tensor 
- 20/20 modules successfully converted and exported
2025-11-25 00:02:21 -05:00

294 lines
12 KiB
Python
Generated
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: src/XX_quantization/XX_quantization.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners) ║
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
# Public API of this module, consumed by `from ... import *` and nbdev tooling.
__all__ = ['INT8_MIN_VALUE', 'INT8_MAX_VALUE', 'INT8_RANGE', 'EPSILON', 'BYTES_PER_FLOAT32', 'BYTES_PER_INT8', 'MB_TO_BYTES',
'SimpleModel', 'QuantizedLinear', 'QuantizationComplete', 'quantize_int8', 'dequantize_int8',
'quantize_model']
# %% ../../modules/15_quantization/15_quantization.ipynb 3
import time
import warnings
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

# Import dependencies from other modules
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.activations import ReLU
# Constants for INT8 quantization
INT8_MIN_VALUE = -128  # Smallest representable signed 8-bit integer
INT8_MAX_VALUE = 127  # Largest representable signed 8-bit integer
INT8_RANGE = 256 # Number of possible INT8 values (from -128 to 127 inclusive)
EPSILON = 1e-8 # Small value for numerical stability (constant tensor detection)
# Constants for memory calculations
BYTES_PER_FLOAT32 = 4 # Standard float32 size in bytes
BYTES_PER_INT8 = 1 # INT8 size in bytes
MB_TO_BYTES = 1024 * 1024 # Megabytes to bytes conversion
# SimpleModel helper for testing (TinyTorch doesn't use Sequential)
class SimpleModel:
    """Minimal layer container demonstrating explicit composition (test helper).

    Any object exposing a ``forward(x)`` method can be used as a layer.
    """

    def __init__(self, *layers):
        # Preserve call order; layers are applied left to right.
        self.layers = list(layers)

    def forward(self, x):
        # Thread the input through every layer in sequence.
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out
if __name__ == "__main__":
    # Smoke message printed when the module is executed directly.
    print("✅ Quantization module imports complete")
# %% ../../modules/15_quantization/15_quantization.ipynb 17
class QuantizedLinear:
    """Quantized version of Linear layer using INT8 arithmetic.

    Stores INT8 weights (and bias, if present) together with their
    (scale, zero_point) quantization parameters. The forward pass
    dequantizes back to FP32 before the matmul — an educational approach;
    production systems would use INT8 GEMM kernels instead.
    """

    def __init__(self, linear_layer: Linear):
        """
        Create quantized version of existing linear layer.
        TODO: Quantize weights and bias, store quantization parameters
        APPROACH:
        1. Quantize weights using quantize_int8
        2. Quantize bias if it exists
        3. Store original layer reference for forward pass
        4. Store quantization parameters for dequantization
        IMPLEMENTATION STRATEGY:
        - Store quantized weights, scales, and zero points
        - Implement forward pass using dequantized computation (educational approach)
        - Production: Would use INT8 matrix multiplication libraries
        """
        ### BEGIN SOLUTION
        # Keep the original layer around for memory-usage comparison.
        self.original_layer = linear_layer
        # Quantize weights
        self.q_weight, self.weight_scale, self.weight_zero_point = quantize_int8(linear_layer.weight)
        # Quantize bias if it exists
        if linear_layer.bias is not None:
            self.q_bias, self.bias_scale, self.bias_zero_point = quantize_int8(linear_layer.bias)
        else:
            self.q_bias = None
            self.bias_scale = None
            self.bias_zero_point = None
        # Input quantization parameters (set during calibration).
        # NOTE: forward() currently computes in FP32 and does not consume
        # these — they exist to demonstrate the calibration step.
        self.input_scale = None
        self.input_zero_point = None
        ### END SOLUTION

    def calibrate(self, sample_inputs: List[Tensor]):
        """
        Calibrate input quantization parameters using sample data.
        TODO: Calculate optimal input quantization parameters
        APPROACH:
        1. Collect statistics from sample inputs
        2. Calculate optimal scale and zero_point for inputs
        3. Store for use in forward pass
        """
        ### BEGIN SOLUTION
        # Pool every observed input value to get global min/max statistics.
        all_values = []
        for inp in sample_inputs:
            all_values.extend(inp.data.flatten())
        all_values = np.array(all_values)
        # Calculate input quantization parameters
        min_val = float(np.min(all_values))
        max_val = float(np.max(all_values))
        if abs(max_val - min_val) < EPSILON:
            # Constant inputs: identity mapping avoids dividing by ~0 range.
            self.input_scale = 1.0
            self.input_zero_point = 0
        else:
            self.input_scale = (max_val - min_val) / (INT8_RANGE - 1)
            self.input_zero_point = int(np.round(INT8_MIN_VALUE - min_val / self.input_scale))
            # FIX: cast back to a plain int after clipping — np.clip returns a
            # NumPy scalar, which was inconsistent with
            # QuantizationComplete.quantize_tensor (which stores a Python int).
            self.input_zero_point = int(np.clip(self.input_zero_point, INT8_MIN_VALUE, INT8_MAX_VALUE))
        ### END SOLUTION

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass with quantized computation.
        TODO: Implement quantized forward pass
        APPROACH:
        1. Quantize input (if calibrated)
        2. Dequantize weights and input for computation (educational approach)
        3. Perform matrix multiplication
        4. Return FP32 result
        NOTE: Production quantization uses INT8 GEMM libraries for speed
        """
        ### BEGIN SOLUTION
        # For educational purposes, we dequantize and compute in FP32.
        # Production systems use specialized INT8 GEMM operations.
        # Dequantize weights
        weight_fp32 = dequantize_int8(self.q_weight, self.weight_scale, self.weight_zero_point)
        # Perform computation (same as original layer)
        result = x.matmul(weight_fp32)
        # Add bias if it exists
        if self.q_bias is not None:
            bias_fp32 = dequantize_int8(self.q_bias, self.bias_scale, self.bias_zero_point)
            result = Tensor(result.data + bias_fp32.data)
        return result
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the quantized linear layer to be called like a function."""
        return self.forward(x)

    def parameters(self) -> List[Tensor]:
        """Return quantized parameters (INT8 weight, plus bias if present)."""
        params = [self.q_weight]
        if self.q_bias is not None:
            params.append(self.q_bias)
        return params

    def memory_usage(self) -> Dict[str, float]:
        """Calculate memory usage in bytes.

        Returns a dict with 'original_bytes' (FP32 layer), 'quantized_bytes'
        (INT8 + scale overhead), and 'compression_ratio'.
        """
        ### BEGIN SOLUTION
        # Original FP32 usage
        original_weight_bytes = self.original_layer.weight.data.size * BYTES_PER_FLOAT32
        original_bias_bytes = 0
        if self.original_layer.bias is not None:
            original_bias_bytes = self.original_layer.bias.data.size * BYTES_PER_FLOAT32
        # Quantized INT8 usage
        quantized_weight_bytes = self.q_weight.data.size * BYTES_PER_INT8
        quantized_bias_bytes = 0
        if self.q_bias is not None:
            quantized_bias_bytes = self.q_bias.data.size * BYTES_PER_INT8
        # Add overhead for scales and zero points (small)
        # 2 floats: one scale for weights, one scale for bias (if present)
        overhead_bytes = BYTES_PER_FLOAT32 * 2
        quantized_total = quantized_weight_bytes + quantized_bias_bytes + overhead_bytes
        original_total = original_weight_bytes + original_bias_bytes
        return {
            'original_bytes': original_total,
            'quantized_bytes': quantized_total,
            'compression_ratio': original_total / quantized_total if quantized_total > 0 else 1.0
        }
        ### END SOLUTION
# %% ../../modules/15_quantization/15_quantization.ipynb 36
class QuantizationComplete:
    """
    Complete quantization system for milestone use.
    Provides INT8 quantization with calibration for 4× memory reduction.
    """

    @staticmethod
    def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]:
        """Quantize FP32 tensor to INT8.

        Uses asymmetric min/max (affine) quantization.
        Returns (quantized_tensor, scale, zero_point).
        """
        data = tensor.data
        min_val = float(np.min(data))
        max_val = float(np.max(data))
        # Constant tensor: map everything to 0 with identity scale to avoid
        # dividing by a near-zero range.
        if abs(max_val - min_val) < EPSILON:
            return Tensor(np.zeros_like(data, dtype=np.int8)), 1.0, 0
        scale = (max_val - min_val) / (INT8_RANGE - 1)
        zero_point = int(np.round(INT8_MIN_VALUE - min_val / scale))
        zero_point = int(np.clip(zero_point, INT8_MIN_VALUE, INT8_MAX_VALUE))
        quantized_data = np.round(data / scale + zero_point)
        quantized_data = np.clip(quantized_data, INT8_MIN_VALUE, INT8_MAX_VALUE).astype(np.int8)
        return Tensor(quantized_data), scale, zero_point

    @staticmethod
    def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
        """Dequantize INT8 tensor back to FP32 via (q - zero_point) * scale."""
        dequantized_data = (q_tensor.data.astype(np.float32) - zero_point) * scale
        return Tensor(dequantized_data)

    @staticmethod
    def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
        """
        Quantize all Linear layers in a model.

        NOTE: calibration_data is accepted for API compatibility but is not
        used here — only weight/bias quantization is performed.
        Returns dictionary with quantization info and memory savings.
        """
        # FIX: return annotation was Dict[str, any] — the builtin `any`
        # function, not typing.Any.
        quantized_layers = {}
        original_size = 0
        quantized_size = 0
        # Iterate through model parameters
        # SimpleModel has .layers, each layer has .parameters() method
        param_idx = 0
        for layer in model.layers:
            for param in layer.parameters():
                param_size = param.data.nbytes
                original_size += param_size
                # Quantize parameter
                q_param, scale, zp = QuantizationComplete.quantize_tensor(param)
                quantized_size += q_param.data.nbytes
                quantized_layers[f'param_{param_idx}'] = {
                    'quantized': q_param,
                    'scale': scale,
                    'zero_point': zp,
                    'original_shape': param.data.shape
                }
                param_idx += 1
        return {
            'quantized_layers': quantized_layers,
            'original_size_mb': original_size / MB_TO_BYTES,
            'quantized_size_mb': quantized_size / MB_TO_BYTES,
            'compression_ratio': original_size / quantized_size if quantized_size > 0 else 1.0
        }

    @staticmethod
    def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]:
        """Compare memory usage between original and quantized models.

        NOTE: original_model is accepted for API compatibility; all figures
        are read from quantized_info (as produced by quantize_model).
        """
        return {
            'original_mb': quantized_info['original_size_mb'],
            'quantized_mb': quantized_info['quantized_size_mb'],
            'compression_ratio': quantized_info['compression_ratio'],
            'memory_saved_mb': quantized_info['original_size_mb'] - quantized_info['quantized_size_mb']
        }
# Convenience functions for backward compatibility
def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]:
    """Quantize FP32 tensor to INT8.

    Thin backward-compatible wrapper over QuantizationComplete.quantize_tensor;
    returns (quantized_tensor, scale, zero_point).
    """
    return QuantizationComplete.quantize_tensor(tensor)
def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
    """Dequantize INT8 tensor back to FP32.

    Thin backward-compatible wrapper over QuantizationComplete.dequantize_tensor.
    """
    return QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point)
def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
    """Quantize entire model to INT8.

    Backward-compatible wrapper over QuantizationComplete.quantize_model.
    FIX: return annotation was Dict[str, any] — the builtin `any` function,
    not typing.Any.
    """
    return QuantizationComplete.quantize_model(model, calibration_data)