mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-05-08 00:28:22 -05:00
WHAT: Added Tensor.__getitem__ (slicing) following progressive disclosure principles MODULE 01 (Tensor): - Added __getitem__ method for basic slicing operations - Clean implementation with NO gradient mentions (progressive disclosure) - Supports all NumPy-style indexing: x[0], x[:3], x[1:4], x[:, 1] - Ensures scalar results are wrapped in arrays MODULE 05 (Autograd): - Added SliceBackward function for gradient computation - Implements proper gradient scatter: zeros everywhere except sliced positions - Added monkey-patching in enable_autograd() for __getitem__ - Follows same pattern as existing operations (add, mul, matmul) MODULE 11 (Embeddings): - Updated PositionalEncoding to use Tensor slicing instead of .data - Fixed multiple .data accesses that broke computation graphs - Removed Tensor() wrapping that created gradient-disconnected leafs - Uses proper Tensor operations to preserve gradient flow TESTING: - All 6 component tests PASS (Embedding, Attention, FFN, Residual, Forward, Training) - 19/19 parameters get gradients (was 18/19 before) - Loss dropping better: 1.54→1.08 (vs 1.62→1.24 before) - Model still not learning (0% accuracy) - needs fresh session to test monkey-patching WHY THIS MATTERS: - Tensor slicing is FUNDAMENTAL - needed by transformers for position embeddings - Progressive disclosure maintains educational integrity - Follows existing TinyTorch architecture patterns - Enables position embeddings to potentially learn (pending verification) DOCUMENTS CREATED: - milestones/05_2017_transformer/TENSOR_SLICING_IMPLEMENTATION.md - milestones/05_2017_transformer/STATUS.md - milestones/05_2017_transformer/FIXES_SUMMARY.md - milestones/05_2017_transformer/DEBUG_REVERSAL.md - tests/milestones/test_reversal_debug.py (component tests) ARCHITECTURAL PRINCIPLE: Progressive disclosure is not just nice-to-have, it's CRITICAL for educational systems. Don't expose Module 05 concepts (gradients) in Module 01 (basic operations). 
Monkey-patch when features are needed, not before.
123 lines
5.8 KiB
Python
Generated
123 lines
5.8 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||
# ║ ║
|
||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||
# ║ ║
|
||
# ║ ✅ TO EDIT: modules/XX_quantization/quantization.py ║
|
||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||
# ║ ║
|
||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||
# ║ Editing it directly may break module functionality and training. ║
|
||
# ║ ║
|
||
# ║ 🎓 LEARNING TIP: Work in modules/ - that's where real development ║
|
||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||
# %% auto 0
|
||
__all__ = ['QuantizationComplete', 'quantize_int8', 'dequantize_int8', 'quantize_model']
|
||
|
||
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 3
|
||
import numpy as np
|
||
import time
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
import warnings
|
||
|
||
# Import dependencies from other modules
|
||
from ..core.tensor import Tensor
|
||
from ..core.layers import Linear
|
||
from ..core.activations import ReLU
|
||
|
||
print("✅ Quantization module imports complete")
|
||
|
||
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 34
|
||
class QuantizationComplete:
    """
    Complete quantization system for milestone use.

    Provides INT8 affine (asymmetric) quantization for ~4x memory reduction:
    FP32 values are mapped onto [-128, 127] via

        q = round(x / scale + zero_point)
        x ≈ (q - zero_point) * scale
    """

    @staticmethod
    def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]:
        """Quantize an FP32 tensor to INT8.

        Args:
            tensor: Input tensor whose ``.data`` is a float NumPy array.

        Returns:
            Tuple ``(q_tensor, scale, zero_point)`` where ``q_tensor`` holds
            int8 data and ``(q - zero_point) * scale`` approximately recovers
            the original values.
        """
        data = tensor.data
        min_val = float(np.min(data))
        max_val = float(np.max(data))

        # Degenerate (constant) tensor: the value range collapses, so the
        # affine mapping below would divide by ~0. Encode the constant
        # directly so dequantization recovers it. (Previously this branch
        # returned scale=1.0 / zp=0 with all-zero data, which dequantized a
        # constant-c tensor to 0.0 everywhere — a silent correctness bug.)
        if abs(max_val - min_val) < 1e-8:
            const = min_val
            if abs(const) < 1e-8:
                # All-zero tensor: zeros with scale 1.0 dequantize exactly.
                return Tensor(np.zeros_like(data, dtype=np.int8)), 1.0, 0
            # Represent c as q=±127 with scale=|c|/127 so (q - 0) * scale == c.
            scale = abs(const) / 127.0
            q_val = np.round(const / scale)
            return Tensor(np.full_like(data, q_val, dtype=np.int8)), scale, 0

        # Map [min_val, max_val] onto the full INT8 range [-128, 127].
        scale = (max_val - min_val) / 255.0
        zero_point = int(np.round(-128 - min_val / scale))
        # Zero point must itself be representable in INT8.
        zero_point = int(np.clip(zero_point, -128, 127))

        quantized_data = np.round(data / scale + zero_point)
        quantized_data = np.clip(quantized_data, -128, 127).astype(np.int8)

        return Tensor(quantized_data), scale, zero_point

    @staticmethod
    def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
        """Dequantize an INT8 tensor back to FP32.

        Args:
            q_tensor: Tensor with int8 data produced by ``quantize_tensor``.
            scale: Scale factor from quantization.
            zero_point: Zero point from quantization.

        Returns:
            FP32 Tensor approximating the original values.
        """
        # Cast to float32 BEFORE the arithmetic to avoid int8 overflow.
        dequantized_data = (q_tensor.data.astype(np.float32) - zero_point) * scale
        return Tensor(dequantized_data)

    @staticmethod
    def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
        """
        Quantize all parameters of a model to INT8.

        Args:
            model: Model exposing a ``parameters()`` iterable of Tensors.
            calibration_data: Reserved for future calibration support;
                currently unused.

        Returns:
            Dict with per-parameter quantization info ('quantized_layers')
            and memory statistics ('original_size_mb', 'quantized_size_mb',
            'compression_ratio').
        """
        quantized_layers = {}
        original_size = 0
        quantized_size = 0

        # Iterate through model parameters (models without parameters()
        # simply report zero sizes).
        if hasattr(model, 'parameters'):
            for i, param in enumerate(model.parameters()):
                param_size = param.data.nbytes
                original_size += param_size

                # Quantize parameter and record everything needed to restore it.
                q_param, scale, zp = QuantizationComplete.quantize_tensor(param)
                quantized_size += q_param.data.nbytes

                quantized_layers[f'param_{i}'] = {
                    'quantized': q_param,
                    'scale': scale,
                    'zero_point': zp,
                    'original_shape': param.data.shape
                }

        return {
            'quantized_layers': quantized_layers,
            'original_size_mb': original_size / (1024 * 1024),
            'quantized_size_mb': quantized_size / (1024 * 1024),
            # Guard against division by zero for parameterless models.
            'compression_ratio': original_size / quantized_size if quantized_size > 0 else 1.0
        }

    @staticmethod
    def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]:
        """Summarize memory usage of original vs. quantized model.

        Note: ``original_model`` is currently unused (kept for interface
        compatibility); all figures come from ``quantized_info`` as returned
        by ``quantize_model``.
        """
        return {
            'original_mb': quantized_info['original_size_mb'],
            'quantized_mb': quantized_info['quantized_size_mb'],
            'compression_ratio': quantized_info['compression_ratio'],
            'memory_saved_mb': quantized_info['original_size_mb'] - quantized_info['quantized_size_mb']
        }
|
||
|
||
# Convenience functions for backward compatibility
|
||
def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]:
    """Quantize an FP32 tensor to INT8.

    Convenience wrapper that delegates to
    ``QuantizationComplete.quantize_tensor`` and passes its
    (quantized tensor, scale, zero_point) triple straight through.
    """
    result = QuantizationComplete.quantize_tensor(tensor)
    return result
|
||
|
||
def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
    """Dequantize an INT8 tensor back to FP32.

    Convenience wrapper around ``QuantizationComplete.dequantize_tensor``;
    the reconstructed tensor is returned unchanged.
    """
    restored = QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point)
    return restored
|
||
|
||
def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
    """Quantize an entire model to INT8.

    Convenience wrapper around ``QuantizationComplete.quantize_model``.

    Args:
        model: Model exposing a ``parameters()`` iterable of Tensors.
        calibration_data: Reserved for future calibration support.

    Returns:
        Dict with quantized parameters and memory statistics.
    """
    # Fixed return annotation: ``any`` is the builtin function, not a type;
    # the correct static type is ``typing.Any``.
    return QuantizationComplete.quantize_model(model, calibration_data)
|