mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-04 01:40:53 -05:00
- Added ProfilerComplete class to profiling_dev.py with all measurement methods - Exported ProfilerComplete to tinytorch/profiling/profiler.py - Created profile_kv_cache.py milestone demonstrating scientific performance measurement - Demo shows 19x speedup from KV caching with detailed profiling metrics - Validates Module 14 KV cache optimization impact quantitatively
156 lines
6.5 KiB
Python
Generated
156 lines
6.5 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
|
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
|
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
|
# ║ ║
|
|
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
|
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
|
# ║ ║
|
|
# ║ ✅ TO EDIT: modules/source/15_profiling/profiling_dev.py ║
|
|
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
|
# ║ ║
|
|
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
|
# ║ Editing it directly may break module functionality and training. ║
|
|
# ║ ║
|
|
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
|
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
|
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
|
# %% auto 0
|
|
# Public names exported by `from <this module> import *`: the two profiler classes defined below.
__all__ = ['Profiler', 'ProfilerComplete']
|
|
|
|
# %% ../../modules/source/15_profiling/profiling_dev.ipynb 1
|
|
import time
|
|
import numpy as np
|
|
import tracemalloc
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
from collections import defaultdict
|
|
import gc
|
|
|
|
# Import our TinyTorch components for profiling
|
|
from ..core.tensor import Tensor
|
|
from ..core.layers import Linear
|
|
from ..core.spatial import Conv2d
|
|
|
|
# %% ../../modules/source/15_profiling/profiling_dev.ipynb 5
|
|
class Profiler:
    """
    Professional-grade ML model profiler for performance analysis.

    Intended for measuring parameters, FLOPs, memory usage, and latency to
    guide optimization and deployment planning. As exported here, the class
    only sets up the bookkeeping state; no measurement methods are defined
    on it in this file.
    """

    def __init__(self):
        """Create an empty profiler: no measurements, zeroed counters, no tracker."""
        ### BEGIN SOLUTION
        # Optional memory-tracking handle; nothing is attached at construction.
        self.memory_tracker = None
        # Per-operation counters; unseen keys read as 0.
        self.operation_counts = defaultdict(int)
        # Storage for recorded measurements.
        self.measurements = dict()
        ### END SOLUTION
|
|
|
|
# %% ../../modules/source/15_profiling/profiling_dev.ipynb 37
|
|
class ProfilerComplete:
    """
    Complete profiler with all measurement capabilities for milestone use.

    This is the exported version students build through the module exercises.

    It offers four measurements:

    * ``count_parameters`` - number of scalar parameters in a model.
    * ``count_flops``      - coarse FLOP estimate for one forward pass.
    * ``measure_memory``   - tracemalloc-based memory figures for a forward pass.
    * ``measure_latency``  - median wall-clock latency of a forward pass.
    """

    # Bytes assumed per parameter when estimating parameter memory. This is a
    # float32-sized estimate; the real dtype of the parameters is not inspected.
    _BYTES_PER_PARAMETER = 4

    def __init__(self):
        """Initialize profiler with measurement state."""
        self.measurements = {}
        self.operation_counts = defaultdict(int)
        self.memory_tracker = None

    @staticmethod
    def _resolve_forward(model):
        """Return the callable that runs *model* forward, or None if there is none.

        ``model.forward`` is preferred; otherwise the model itself is used
        when it is callable.
        """
        if hasattr(model, 'forward'):
            return model.forward
        if callable(model):
            return model
        return None

    def count_parameters(self, model) -> int:
        """Count total parameters in a model.

        Args:
            model: Object exposing either a ``parameters()`` method yielding
                items with a ``.data.size`` attribute, or ``weight`` (and
                optionally ``bias``) attributes with ``.data.size``.

        Returns:
            The summed element count, or 0 when the model exposes neither
            ``parameters`` nor ``weight``.
        """
        if hasattr(model, 'parameters'):
            return sum(param.data.size for param in model.parameters())

        total_params = 0
        if hasattr(model, 'weight'):
            total_params += model.weight.data.size
            # A layer may be built without a bias, in which case it is None.
            if getattr(model, 'bias', None) is not None:
                total_params += model.bias.data.size
        return total_params

    def count_flops(self, model, input_shape: Tuple[int, ...]) -> int:
        """Estimate FLOPs for one forward pass.

        The model is dispatched on its class *name*, so any class called
        ``Linear`` or ``Conv2d`` is recognised.

        Args:
            model: The layer to analyse.
            input_shape: Shape of the input; only the last dimension is used
                (as the number of input features for ``Linear``).

        Returns:
            ``in_features * out_features * 2`` for ``Linear`` (one multiply
            and one add per weight; leading/batch dimensions are ignored),
            a fixed placeholder of 1,000,000 for ``Conv2d``, and 0 for any
            other model.
        """
        model_name = model.__class__.__name__

        if model_name == 'Linear':
            in_features = input_shape[-1]
            # weight is indexed as (in_features, out_features).
            out_features = model.weight.shape[1] if hasattr(model, 'weight') else 1
            return in_features * out_features * 2

        if model_name == 'Conv2d':
            return 1000000  # Simplified for now

        return 0

    def measure_memory(self, model, input_shape: Tuple[int, ...]) -> Dict[str, float]:
        """Measure memory usage during a forward pass.

        A random input of ``input_shape`` is created and pushed through the
        model while ``tracemalloc`` is tracing. Failures of the forward pass
        are tolerated (best effort): the figures then only reflect the input
        allocation.

        If tracing was already active when this method is called, it is left
        running afterwards and the reported peak is the peak since that
        earlier tracing started.

        Args:
            model: The model to run.
            input_shape: Shape of the random input tensor.

        Returns:
            Dict with ``parameter_memory_mb`` (parameter count times 4 bytes),
            ``activation_memory_mb`` (peak minus parameter memory, floored at
            0), ``peak_memory_mb`` (tracemalloc peak) and
            ``memory_efficiency`` (parameter / peak, 0 when peak is 0).
        """
        param_count = self.count_parameters(model)
        parameter_memory_bytes = param_count * self._BYTES_PER_PARAMETER
        parameter_memory_mb = parameter_memory_bytes / (1024 * 1024)

        forward = self._resolve_forward(model)

        # Only start/stop tracing if we own it, so a caller's session survives.
        already_tracing = tracemalloc.is_tracing()
        if not already_tracing:
            tracemalloc.start()
        try:
            dummy_input = Tensor(np.random.randn(*input_shape))
            if forward is not None:
                try:
                    forward(dummy_input)
                except Exception:
                    # Best effort: a model that cannot run still gets a report.
                    pass
            # get_traced_memory() returns (current, peak); the peak is wanted.
            _, peak_memory = tracemalloc.get_traced_memory()
        finally:
            if not already_tracing:
                tracemalloc.stop()

        peak_memory_mb = peak_memory / (1024 * 1024)
        activation_memory_mb = max(0, peak_memory_mb - parameter_memory_mb)

        return {
            'parameter_memory_mb': parameter_memory_mb,
            'activation_memory_mb': activation_memory_mb,
            'peak_memory_mb': peak_memory_mb,
            'memory_efficiency': parameter_memory_mb / peak_memory_mb if peak_memory_mb > 0 else 0
        }

    def measure_latency(self, model, input_tensor, warmup: int = 10, iterations: int = 100) -> float:
        """Measure model inference latency.

        Runs ``warmup`` untimed forward passes, then times ``iterations``
        passes with ``time.perf_counter`` and reports the median.

        Exceptions raised by the model are swallowed (best effort), so a
        model that fails on every call yields a near-zero latency rather
        than an error.

        Args:
            model: Object with a ``forward`` method, or a callable.
            input_tensor: Input passed unchanged to every forward call.
            warmup: Number of untimed warm-up calls.
            iterations: Number of timed calls.

        Returns:
            Median latency in milliseconds, or NaN when ``iterations`` is
            not positive (no samples were taken).
        """
        # Resolve the entry point once so attribute lookups are not timed.
        forward = self._resolve_forward(model)

        def run_once():
            if forward is None:
                return
            try:
                forward(input_tensor)
            except Exception:
                # Best effort: keep measuring even if a call fails.
                pass

        # Warmup
        for _ in range(warmup):
            run_once()

        # Measurement
        times = []
        for _ in range(iterations):
            start = time.perf_counter()
            run_once()
            times.append(time.perf_counter() - start)

        if not times:
            # Avoid numpy's empty-median warning; there is nothing to report.
            return float('nan')

        return float(np.median(times) * 1000)
|