mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-19 07:49:41 -05:00
Major directory restructure to support both developer and learner workflows: Structure Changes: - NEW: src/ directory for Python source files (version controlled) - Files renamed: tensor.py → 01_tensor.py (matches directory naming) - All 20 modules moved from modules/ to src/ - CHANGED: modules/ now holds generated notebooks (gitignored) - Generated from src/*.py using jupytext - Learners work in notebooks, developers work in Python source - UNCHANGED: tinytorch/ package (still auto-generated from notebooks) Workflow: src/*.py → modules/*.ipynb → tinytorch/*.py Command Updates: - Updated export command to read from src/ and generate to modules/ - Export flow: discovers modules in src/, converts to notebooks in modules/, exports to tinytorch/ - All 20 modules tested and working Configuration: - Updated .gitignore to ignore modules/ directory - Updated README.md with new three-layer architecture explanation - Updated export.py source mappings and paths Benefits: - Clean separation: developers edit Python, learners use notebooks - Better version control: only Python source committed, notebooks generated - Flexible learning: can work in notebooks OR Python source - Maintains backward compatibility: tinytorch package unchanged Tested: - Single module export: tito export 01_tensor ✅ - All modules export: tito export --all ✅ - Package imports: from tinytorch.core.tensor import Tensor ✅ - 20/20 modules successfully converted and exported
294 lines
12 KiB
Python
Generated
294 lines
12 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨                                                         ║
# ║ AUTOGENERATED! DO NOT EDIT!                                                    ║
# ║                                                                                ║
# ║ This file is AUTOMATICALLY GENERATED from source modules.                      ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!               ║
# ║                                                                                ║
# ║ ✅ TO EDIT: src/15_quantization/15_quantization.py                             ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>'                         ║
# ║                                                                                ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations.           ║
# ║ Editing it directly may break module functionality and training.               ║
# ║                                                                                ║
# ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)              ║
# ║ The tinytorch/ directory is generated code - edit source files instead!        ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = [
    # INT8 quantization constants
    'INT8_MIN_VALUE', 'INT8_MAX_VALUE', 'INT8_RANGE', 'EPSILON',
    # Memory-size constants
    'BYTES_PER_FLOAT32', 'BYTES_PER_INT8', 'MB_TO_BYTES',
    # Public classes and convenience functions
    'SimpleModel', 'QuantizedLinear', 'QuantizationComplete',
    'quantize_int8', 'dequantize_int8', 'quantize_model',
]
# %% ../../modules/15_quantization/15_quantization.ipynb 3
import time
import warnings
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

# Import dependencies from other modules
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.activations import ReLU
# Constants for INT8 quantization
INT8_MIN_VALUE = -128  # Smallest representable signed 8-bit value
INT8_MAX_VALUE = 127   # Largest representable signed 8-bit value
INT8_RANGE = 256  # Number of possible INT8 values (from -128 to 127 inclusive)
EPSILON = 1e-8  # Small value for numerical stability (constant tensor detection)

# Constants for memory calculations
BYTES_PER_FLOAT32 = 4  # Standard float32 size in bytes
BYTES_PER_INT8 = 1  # INT8 size in bytes
MB_TO_BYTES = 1024 * 1024  # Megabytes to bytes conversion
# SimpleModel helper for testing (TinyTorch doesn't use Sequential)
class SimpleModel:
    """Minimal ordered container of layers for testing.

    Demonstrates explicit composition: each layer's forward() output feeds
    the next layer's input.
    """

    def __init__(self, *layers):
        # Preserve the layers in the exact order they were given.
        self.layers = list(layers)

    def forward(self, x):
        """Run the input through every layer in sequence and return the result."""
        activation = x
        for stage in self.layers:
            activation = stage.forward(activation)
        return activation
# Smoke-test message printed only when this generated module is run directly.
if __name__ == "__main__":
    print("✅ Quantization module imports complete")
# %% ../../modules/15_quantization/15_quantization.ipynb 17
class QuantizedLinear:
    """Quantized version of Linear layer using INT8 arithmetic.

    Weights (and bias, when present) are stored as INT8 tensors together
    with their affine quantization parameters (scale, zero point).  The
    forward pass dequantizes back to FP32 for the actual computation — an
    educational approach; production systems use INT8 GEMM kernels instead.
    """

    def __init__(self, linear_layer: Linear):
        """
        Create quantized version of existing linear layer.

        Args:
            linear_layer: FP32 Linear layer to quantize.  A reference is
                kept so memory_usage() can compare against the original.
        """
        ### BEGIN SOLUTION
        self.original_layer = linear_layer

        # Quantize weights
        self.q_weight, self.weight_scale, self.weight_zero_point = quantize_int8(linear_layer.weight)

        # Quantize bias if it exists
        if linear_layer.bias is not None:
            self.q_bias, self.bias_scale, self.bias_zero_point = quantize_int8(linear_layer.bias)
        else:
            self.q_bias = None
            self.bias_scale = None
            self.bias_zero_point = None

        # Input quantization parameters (set later by calibrate())
        self.input_scale = None
        self.input_zero_point = None
        ### END SOLUTION

    def calibrate(self, sample_inputs: List[Tensor]):
        """
        Calibrate input quantization parameters using sample data.

        Derives an affine (scale, zero_point) mapping from the observed
        min/max of all sample inputs onto the INT8 range and stores it in
        self.input_scale / self.input_zero_point.

        Args:
            sample_inputs: representative input tensors for this layer.
        """
        ### BEGIN SOLUTION
        # Gather every observed value into one flat array.  np.concatenate
        # avoids the previous O(n) Python-object churn of list.extend over
        # individual floats.
        all_values = np.concatenate([inp.data.flatten() for inp in sample_inputs])

        # Calculate input quantization parameters
        min_val = float(np.min(all_values))
        max_val = float(np.max(all_values))

        if abs(max_val - min_val) < EPSILON:
            # Effectively-constant inputs: identity mapping avoids division by ~0.
            self.input_scale = 1.0
            self.input_zero_point = 0
        else:
            self.input_scale = (max_val - min_val) / (INT8_RANGE - 1)
            self.input_zero_point = int(np.round(INT8_MIN_VALUE - min_val / self.input_scale))
            # Fix: cast back to a plain int after clipping (np.clip returns a
            # NumPy scalar) — consistent with QuantizationComplete.quantize_tensor.
            self.input_zero_point = int(np.clip(self.input_zero_point, INT8_MIN_VALUE, INT8_MAX_VALUE))
        ### END SOLUTION

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass with quantized computation.

        Dequantizes the stored INT8 weights (and bias) to FP32 and performs
        a standard matmul.  NOTE: the calibrated input parameters are not
        used here — production quantization would quantize x and dispatch to
        INT8 GEMM libraries for speed.

        Args:
            x: FP32 input tensor.

        Returns:
            FP32 output tensor.
        """
        ### BEGIN SOLUTION
        # For educational purposes, we dequantize and compute in FP32.
        # Production systems use specialized INT8 GEMM operations.

        # Dequantize weights
        weight_fp32 = dequantize_int8(self.q_weight, self.weight_scale, self.weight_zero_point)

        # Perform computation (same as original layer)
        result = x.matmul(weight_fp32)

        # Add bias if it exists
        if self.q_bias is not None:
            bias_fp32 = dequantize_int8(self.q_bias, self.bias_scale, self.bias_zero_point)
            result = Tensor(result.data + bias_fp32.data)

        return result
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the quantized linear layer to be called like a function."""
        return self.forward(x)

    def parameters(self) -> List[Tensor]:
        """Return quantized parameters (q_weight, plus q_bias when present)."""
        params = [self.q_weight]
        if self.q_bias is not None:
            params.append(self.q_bias)
        return params

    def memory_usage(self) -> Dict[str, float]:
        """Calculate memory usage in bytes.

        Returns:
            Dict with 'original_bytes', 'quantized_bytes' and
            'compression_ratio' (original / quantized; 1.0 when the
            quantized size is zero, to avoid division by zero).
        """
        ### BEGIN SOLUTION
        # Original FP32 usage
        original_weight_bytes = self.original_layer.weight.data.size * BYTES_PER_FLOAT32
        original_bias_bytes = 0
        if self.original_layer.bias is not None:
            original_bias_bytes = self.original_layer.bias.data.size * BYTES_PER_FLOAT32

        # Quantized INT8 usage
        quantized_weight_bytes = self.q_weight.data.size * BYTES_PER_INT8
        quantized_bias_bytes = 0
        if self.q_bias is not None:
            quantized_bias_bytes = self.q_bias.data.size * BYTES_PER_INT8

        # Add overhead for scales and zero points (small)
        # 2 floats: one scale for weights, one scale for bias (if present).
        # NOTE(review): zero points are not counted — negligible in practice.
        overhead_bytes = BYTES_PER_FLOAT32 * 2

        quantized_total = quantized_weight_bytes + quantized_bias_bytes + overhead_bytes
        original_total = original_weight_bytes + original_bias_bytes

        return {
            'original_bytes': original_total,
            'quantized_bytes': quantized_total,
            'compression_ratio': original_total / quantized_total if quantized_total > 0 else 1.0
        }
        ### END SOLUTION
# %% ../../modules/15_quantization/15_quantization.ipynb 36
class QuantizationComplete:
    """
    Complete quantization system for milestone use.

    Provides INT8 quantization with calibration for 4× memory reduction.
    """

    @staticmethod
    def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]:
        """Quantize FP32 tensor to INT8.

        Returns:
            (quantized_tensor, scale, zero_point).  Constant tensors map to
            all-zeros with scale=1.0 and zero_point=0 to avoid a division
            by zero in the scale computation.
        """
        data = tensor.data
        min_val = float(np.min(data))
        max_val = float(np.max(data))

        # Degenerate (constant) tensor: nothing to scale.
        if abs(max_val - min_val) < EPSILON:
            return Tensor(np.zeros_like(data, dtype=np.int8)), 1.0, 0

        # Affine mapping: real_value = (q - zero_point) * scale
        scale = (max_val - min_val) / (INT8_RANGE - 1)
        zero_point = int(np.round(INT8_MIN_VALUE - min_val / scale))
        zero_point = int(np.clip(zero_point, INT8_MIN_VALUE, INT8_MAX_VALUE))

        quantized_data = np.round(data / scale + zero_point)
        quantized_data = np.clip(quantized_data, INT8_MIN_VALUE, INT8_MAX_VALUE).astype(np.int8)

        return Tensor(quantized_data), scale, zero_point

    @staticmethod
    def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
        """Dequantize INT8 tensor back to FP32."""
        dequantized_data = (q_tensor.data.astype(np.float32) - zero_point) * scale
        return Tensor(dequantized_data)

    @staticmethod
    def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
        """
        Quantize all parameters of a model's Linear layers.

        Args:
            model: model exposing a .layers list whose entries implement
                parameters() (e.g. SimpleModel).
            calibration_data: accepted for API compatibility; not used by
                this weight-only quantization pass.

        Returns:
            Dict with per-parameter quantization info and memory savings.
        """
        quantized_layers = {}
        original_size = 0
        quantized_size = 0

        # Iterate through model parameters.
        # SimpleModel has .layers; each layer has a .parameters() method.
        param_idx = 0
        for layer in model.layers:
            for param in layer.parameters():
                param_size = param.data.nbytes
                original_size += param_size

                # Quantize parameter
                q_param, scale, zp = QuantizationComplete.quantize_tensor(param)
                quantized_size += q_param.data.nbytes

                quantized_layers[f'param_{param_idx}'] = {
                    'quantized': q_param,
                    'scale': scale,
                    'zero_point': zp,
                    'original_shape': param.data.shape
                }
                param_idx += 1

        return {
            'quantized_layers': quantized_layers,
            'original_size_mb': original_size / MB_TO_BYTES,
            'quantized_size_mb': quantized_size / MB_TO_BYTES,
            'compression_ratio': original_size / quantized_size if quantized_size > 0 else 1.0
        }

    @staticmethod
    def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]:
        """Compare memory usage between original and quantized models."""
        return {
            'original_mb': quantized_info['original_size_mb'],
            'quantized_mb': quantized_info['quantized_size_mb'],
            'compression_ratio': quantized_info['compression_ratio'],
            'memory_saved_mb': quantized_info['original_size_mb'] - quantized_info['quantized_size_mb']
        }
# Convenience functions for backward compatibility
def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]:
    """Quantize FP32 tensor to INT8 (delegates to QuantizationComplete)."""
    q_tensor, scale, zero_point = QuantizationComplete.quantize_tensor(tensor)
    return q_tensor, scale, zero_point
def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
    """Dequantize INT8 tensor back to FP32 (delegates to QuantizationComplete)."""
    restored = QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point)
    return restored
def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
    """Quantize entire model to INT8.

    Args:
        model: model with a .layers list (e.g. SimpleModel).
        calibration_data: optional sample inputs; accepted for API
            compatibility and forwarded to the underlying implementation.

    Returns:
        Dict with quantized layers and memory statistics.
    """
    return QuantizationComplete.quantize_model(model, calibration_data)