TinyTorch/tinytorch/profiling/profiler.py
Commit 43ea5f9a65 — Vijay Janapa Reddi, 2025-12-03 09:36:10 -08:00
Fix MLPerf milestone metrics: FLOPs calculation, quantization compression ratio, pruning delta sign

- Fixed FLOPs calculation to handle models with a .layers attribute (not just Sequential)
- Fixed quantization compression ratio to calculate the theoretical INT8 size (1 byte per element)
- Fixed pruning accuracy delta sign to correctly show the +/- direction
- Added missing export directives for Tensor and numpy imports in the acceleration module

Results now correctly show:
- FLOPs: 4,736 (was incorrectly showing 64)
- Quantization: 4.0x compression (was incorrectly showing 1.0x)
- Pruning delta: correct +/- sign based on actual accuracy change


# ╔═════════════════════════════════════════════════════════════════════════════╗
# ║                           🚨 CRITICAL WARNING 🚨                              ║
# ║                        AUTOGENERATED! DO NOT EDIT!                            ║
# ║                                                                               ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                    ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!             ║
# ║                                                                               ║
# ║  ✅ TO EDIT:   src/XX_profiler/XX_profiler.py                                 ║
# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                       ║
# ║                                                                               ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.         ║
# ║     Editing it directly may break module functionality and training.          ║
# ║                                                                               ║
# ║  🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)            ║
# ║     The tinytorch/ directory is generated code - edit source files instead!   ║
# ╚═════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['BYTES_PER_FLOAT32', 'KB_TO_BYTES', 'MB_TO_BYTES', 'Profiler', 'quick_profile', 'analyze_weight_distribution']
# %% ../../modules/14_profiling/14_profiling.ipynb 1
import sys
import os
import time
import numpy as np
import tracemalloc
from typing import Dict, List, Any, Optional, Tuple
from collections import defaultdict
import gc
# Import from TinyTorch package (previous modules must be completed and exported)
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.spatial import Conv2d
# Constants for memory and performance measurement
BYTES_PER_FLOAT32 = 4 # Standard float32 size in bytes
KB_TO_BYTES = 1024 # Kilobytes to bytes conversion
MB_TO_BYTES = 1024 * 1024 # Megabytes to bytes conversion
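# Worked example (comment only, for orientation): a Linear(1024, 512) layer
# stores 1024*512 + 512 = 524,800 float32 values, i.e.
# 524,800 * BYTES_PER_FLOAT32 = 2,099,200 bytes ≈ 2.0 MB (2,099,200 / MB_TO_BYTES).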
# %% ../../modules/14_profiling/14_profiling.ipynb 6
class Profiler:
    """
    Professional-grade ML model profiler for performance analysis.

    Measures parameters, FLOPs, memory usage, and latency with statistical rigor.
    Used for optimization guidance and deployment planning.
    """

    def __init__(self):
        """
        Initialize profiler with measurement state.

        TODO: Set up profiler tracking structures

        APPROACH:
        1. Create empty measurements dictionary
        2. Initialize operation counters
        3. Set up memory tracking state

        EXAMPLE:
        >>> profiler = Profiler()
        >>> profiler.measurements
        {}

        HINTS:
        - Use defaultdict(int) for operation counters
        - measurements dict will store timing results
        """
        ### BEGIN SOLUTION
        self.measurements = {}
        self.operation_counts = defaultdict(int)
        self.memory_tracker = None
        ### END SOLUTION

    def count_parameters(self, model) -> int:
        """
        Count total trainable parameters in a model.

        TODO: Implement parameter counting for any model with a parameters() method

        APPROACH:
        1. Get all parameters from model.parameters() if available
        2. For single layers, count weight and bias directly
        3. Sum total element count across all parameter tensors

        EXAMPLE:
        >>> linear = Linear(128, 64)  # 128*64 + 64 = 8256 parameters
        >>> profiler = Profiler()
        >>> count = profiler.count_parameters(linear)
        >>> print(count)
        8256

        HINTS:
        - Use parameter.data.size for tensor element count
        - Handle models with and without a parameters() method
        - Don't forget bias terms when present
        """
        ### BEGIN SOLUTION
        total_params = 0
        # Handle SimpleModel pattern (has .layers attribute)
        if hasattr(model, 'layers'):
            # SimpleModel: iterate through layers
            for layer in model.layers:
                for param in layer.parameters():
                    total_params += param.data.size
        elif hasattr(model, 'parameters'):
            # Model with direct parameters() method
            for param in model.parameters():
                total_params += param.data.size
        elif hasattr(model, 'weight'):
            # Single layer (Linear, Conv2d) - all have .weight
            total_params += model.weight.data.size
            # Check for bias (may be None)
            if hasattr(model, 'bias') and model.bias is not None:
                total_params += model.bias.data.size
        else:
            # No parameters (activations, etc.)
            total_params = 0
        return total_params
        ### END SOLUTION
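    # Worked example (comment only): for a hypothetical two-layer model with
    # layers = [Linear(128, 64), Linear(64, 10)], count_parameters would return
    # (128*64 + 64) + (64*10 + 10) = 8,256 + 650 = 8,906.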
    def count_flops(self, model, input_shape: Tuple[int, ...]) -> int:
        """
        Count FLOPs (Floating Point Operations) for one forward pass.

        TODO: Implement FLOP counting for different layer types

        APPROACH:
        1. Create dummy input with given shape
        2. Calculate FLOPs based on layer type and dimensions
        3. Handle different model architectures (Linear, Conv2d, Sequential)

        LAYER-SPECIFIC FLOP FORMULAS:
        - Linear: input_features × output_features × 2 (matmul + bias)
        - Conv2d: output_h × output_w × kernel_h × kernel_w × in_channels × out_channels × 2
        - Activation: Usually 1 FLOP per element (ReLU, Sigmoid)

        EXAMPLE:
        >>> linear = Linear(128, 64)
        >>> profiler = Profiler()
        >>> flops = profiler.count_flops(linear, (1, 128))
        >>> print(flops)  # 128 * 64 * 2 = 16384
        16384

        HINTS:
        - Batch dimension doesn't affect per-sample FLOPs
        - Focus on major operations (matmul, conv) first
        - For Sequential models, sum FLOPs of all layers
        """
        ### BEGIN SOLUTION
        # Create dummy input (unused but kept for interface consistency)
        _dummy_input = Tensor(np.random.randn(*input_shape))
        total_flops = 0
        # Handle different model types
        if hasattr(model, '__class__'):
            model_name = model.__class__.__name__
            if model_name == 'Linear':
                # Linear layer: input_features × output_features × 2
                in_features = input_shape[-1]
                out_features = model.weight.shape[1] if hasattr(model, 'weight') else 1
                total_flops = in_features * out_features * 2
            elif model_name == 'Conv2d':
                # Conv2d layer: complex calculation based on output size
                # Simplified: assume we know the output dimensions
                if hasattr(model, 'kernel_size') and hasattr(model, 'in_channels'):
                    _batch_size = input_shape[0] if len(input_shape) > 3 else 1
                    in_channels = model.in_channels
                    out_channels = model.out_channels
                    kernel_h = kernel_w = model.kernel_size
                    # Estimate output size (simplified)
                    input_h, input_w = input_shape[-2], input_shape[-1]
                    output_h = input_h // (model.stride if hasattr(model, 'stride') else 1)
                    output_w = input_w // (model.stride if hasattr(model, 'stride') else 1)
                    total_flops = (output_h * output_w * kernel_h * kernel_w *
                                   in_channels * out_channels * 2)
            elif model_name == 'Sequential' or hasattr(model, 'layers'):
                # Sequential model or model with layers: sum FLOPs of all layers
                current_shape = input_shape
                for layer in model.layers:
                    layer_flops = self.count_flops(layer, current_shape)
                    total_flops += layer_flops
                    # Update shape for next layer (simplified)
                    if hasattr(layer, 'weight'):
                        current_shape = current_shape[:-1] + (layer.weight.shape[1],)
            else:
                # Activation or other: assume 1 FLOP per element
                total_flops = int(np.prod(input_shape))
        return total_flops
        ### END SOLUTION
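    # Worked example (comment only): for a hypothetical Conv2d with
    # in_channels=3, out_channels=8, kernel_size=3, stride=1 on a (1, 3, 8, 8)
    # input, the simplified estimate above gives output_h = output_w = 8, so
    # FLOPs = 8*8 * 3*3 * 3 * 8 * 2 = 27,648.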
    def measure_memory(self, model, input_shape: Tuple[int, ...]) -> Dict[str, float]:
        """
        Measure memory usage during forward pass.

        TODO: Implement memory tracking for model execution

        APPROACH:
        1. Use tracemalloc to track memory allocation
        2. Measure baseline memory before model execution
        3. Run forward pass and track peak usage
        4. Calculate different memory components

        RETURN DICTIONARY:
        - 'parameter_memory_mb': Memory for model parameters
        - 'activation_memory_mb': Memory for activations
        - 'peak_memory_mb': Maximum memory usage
        - 'memory_efficiency': Ratio of useful to total memory

        EXAMPLE:
        >>> linear = Linear(1024, 512)
        >>> profiler = Profiler()
        >>> memory = profiler.measure_memory(linear, (32, 1024))
        >>> print(f"Parameters: {memory['parameter_memory_mb']:.1f} MB")
        Parameters: 2.0 MB

        HINTS:
        - Use tracemalloc.start() and tracemalloc.get_traced_memory()
        - Account for float32 = 4 bytes per parameter
        - Activation memory scales with batch size
        """
        ### BEGIN SOLUTION
        # Start memory tracking
        tracemalloc.start()
        # Measure baseline memory
        _baseline_memory = tracemalloc.get_traced_memory()[0]
        # Calculate parameter memory
        param_count = self.count_parameters(model)
        parameter_memory_bytes = param_count * BYTES_PER_FLOAT32
        parameter_memory_mb = parameter_memory_bytes / MB_TO_BYTES
        # Create input and measure activation memory
        dummy_input = Tensor(np.random.randn(*input_shape))
        input_memory_bytes = dummy_input.data.nbytes
        # Estimate activation memory (simplified)
        activation_memory_bytes = input_memory_bytes * 2  # Rough estimate
        activation_memory_mb = activation_memory_bytes / MB_TO_BYTES
        # Run forward pass to measure peak memory usage
        _ = model.forward(dummy_input)
        # Get peak memory
        _current_memory, peak_memory = tracemalloc.get_traced_memory()
        peak_memory_mb = (peak_memory - _baseline_memory) / MB_TO_BYTES
        tracemalloc.stop()
        # Calculate efficiency
        useful_memory = parameter_memory_mb + activation_memory_mb
        memory_efficiency = useful_memory / max(peak_memory_mb, 0.001)  # Avoid division by zero
        return {
            'parameter_memory_mb': parameter_memory_mb,
            'activation_memory_mb': activation_memory_mb,
            'peak_memory_mb': max(peak_memory_mb, useful_memory),
            'memory_efficiency': min(memory_efficiency, 1.0)
        }
        ### END SOLUTION
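    # Note (comment only): tracemalloc tracks Python-level allocations, so NumPy
    # buffer reuse can make the traced peak smaller than the analytical estimate;
    # that is why the return clamps peak_memory_mb to at least parameter +
    # activation memory. Worked example: if Tensor stores float32, a (32, 1024)
    # input occupies 32*1024*4 = 131,072 bytes, and the rough activation
    # estimate simply doubles that.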
    def measure_latency(self, model, input_tensor, warmup: int = 10, iterations: int = 100) -> float:
        """
        Measure model inference latency with statistical rigor.

        TODO: Implement accurate latency measurement

        APPROACH:
        1. Run warmup iterations to stabilize performance
        2. Measure multiple iterations for statistical accuracy
        3. Calculate median latency to handle outliers
        4. Return latency in milliseconds

        PARAMETERS:
        - warmup: Number of warmup runs (default 10)
        - iterations: Number of measurement runs (default 100)

        EXAMPLE:
        >>> linear = Linear(128, 64)
        >>> input_tensor = Tensor(np.random.randn(1, 128))
        >>> profiler = Profiler()
        >>> latency = profiler.measure_latency(linear, input_tensor)
        >>> print(f"Latency: {latency:.2f} ms")
        Latency: 0.15 ms

        HINTS:
        - Use time.perf_counter() for high precision
        - Use median instead of mean for robustness against outliers
        - Handle different model interfaces (forward, __call__)
        """
        ### BEGIN SOLUTION
        # Warmup runs to stabilize performance
        for _ in range(warmup):
            _ = model.forward(input_tensor)
        # Measurement runs
        times = []
        for _ in range(iterations):
            start_time = time.perf_counter()
            _ = model.forward(input_tensor)
            end_time = time.perf_counter()
            times.append((end_time - start_time) * 1000)  # Convert to milliseconds
        # Calculate statistics - use median for robustness
        times = np.array(times)
        median_latency = np.median(times)
        return float(median_latency)
        ### END SOLUTION
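    # Note (comment only): the median is used because a single GC pause or OS
    # scheduling hiccup can inflate the mean. A tail-latency variant is a
    # one-line extension, e.g. float(np.percentile(times, 95)) for p95.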
    def profile_layer(self, layer, input_shape: Tuple[int, ...]) -> Dict[str, Any]:
        """
        Profile a single layer comprehensively.

        TODO: Implement layer-wise profiling

        APPROACH:
        1. Count parameters for this layer
        2. Count FLOPs for this layer
        3. Measure memory usage
        4. Measure latency
        5. Return comprehensive layer profile

        EXAMPLE:
        >>> linear = Linear(256, 128)
        >>> profiler = Profiler()
        >>> profile = profiler.profile_layer(linear, (32, 256))
        >>> print(f"Layer uses {profile['parameters']} parameters")
        Layer uses 32896 parameters

        HINTS:
        - Use existing profiler methods (count_parameters, count_flops, etc.)
        - Create dummy input for latency measurement
        - Include layer type information in profile
        """
        ### BEGIN SOLUTION
        # Create dummy input for latency measurement
        dummy_input = Tensor(np.random.randn(*input_shape))
        # Gather all measurements
        params = self.count_parameters(layer)
        flops = self.count_flops(layer, input_shape)
        memory = self.measure_memory(layer, input_shape)
        latency = self.measure_latency(layer, dummy_input, warmup=3, iterations=10)
        # Compute derived metrics
        gflops_per_second = (flops / 1e9) / max(latency / 1000, 1e-6)
        return {
            'layer_type': layer.__class__.__name__,
            'parameters': params,
            'flops': flops,
            'latency_ms': latency,
            'gflops_per_second': gflops_per_second,
            **memory
        }
        ### END SOLUTION
    def profile_forward_pass(self, model, input_tensor) -> Dict[str, Any]:
        """
        Comprehensive profiling of a model's forward pass.

        TODO: Implement complete forward pass analysis

        APPROACH:
        1. Use Profiler class to gather all measurements
        2. Create comprehensive performance profile
        3. Add derived metrics and insights
        4. Return structured analysis results

        RETURN METRICS:
        - All basic profiler measurements
        - FLOPs per second (computational efficiency)
        - Memory bandwidth utilization
        - Performance bottleneck identification

        EXAMPLE:
        >>> model = Linear(256, 128)
        >>> input_data = Tensor(np.random.randn(32, 256))
        >>> profiler = Profiler()
        >>> profile = profiler.profile_forward_pass(model, input_data)
        >>> print(f"Throughput: {profile['gflops_per_second']:.2f} GFLOP/s")
        Throughput: 2.45 GFLOP/s

        HINTS:
        - GFLOP/s = (FLOPs / 1e9) / (latency_ms / 1000)
        - Memory bandwidth = memory_mb / (latency_ms / 1000)
        - Consider realistic hardware limits for efficiency calculations
        """
        ### BEGIN SOLUTION
        # Basic measurements
        param_count = self.count_parameters(model)
        flops = self.count_flops(model, input_tensor.shape)
        memory_stats = self.measure_memory(model, input_tensor.shape)
        latency_ms = self.measure_latency(model, input_tensor, warmup=5, iterations=20)
        # Derived metrics
        latency_seconds = latency_ms / 1000.0
        gflops_per_second = (flops / 1e9) / max(latency_seconds, 1e-6)
        # Memory bandwidth (MB/s)
        memory_bandwidth = memory_stats['peak_memory_mb'] / max(latency_seconds, 1e-6)
        # Efficiency metrics
        theoretical_peak_gflops = 100.0  # Assume 100 GFLOP/s theoretical peak for CPU
        computational_efficiency = min(gflops_per_second / theoretical_peak_gflops, 1.0)
        # Bottleneck analysis
        is_memory_bound = memory_bandwidth > gflops_per_second * 100  # Rough heuristic
        is_compute_bound = not is_memory_bound
        return {
            # Basic measurements
            'parameters': param_count,
            'flops': flops,
            'latency_ms': latency_ms,
            **memory_stats,
            # Derived metrics
            'gflops_per_second': gflops_per_second,
            'memory_bandwidth_mbs': memory_bandwidth,
            'computational_efficiency': computational_efficiency,
            # Bottleneck analysis
            'is_memory_bound': is_memory_bound,
            'is_compute_bound': is_compute_bound,
            'bottleneck': 'memory' if is_memory_bound else 'compute'
        }
        ### END SOLUTION
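    # Worked example (comment only): a Linear(128, 64) forward pass costs
    # 128*64*2 = 16,384 FLOPs; at a measured latency of 0.15 ms that is
    # (16,384 / 1e9) / 0.00015 ≈ 0.11 GFLOP/s, i.e. roughly 0.1% of the
    # assumed 100 GFLOP/s CPU peak.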
    def profile_backward_pass(self, model, input_tensor, _loss_fn=None) -> Dict[str, Any]:
        """
        Profile both forward and backward passes for training analysis.

        TODO: Implement training-focused profiling

        APPROACH:
        1. Profile forward pass first
        2. Estimate backward pass costs (typically 2× forward)
        3. Calculate total training iteration metrics
        4. Analyze memory requirements for gradients and optimizers

        BACKWARD PASS ESTIMATES:
        - FLOPs: ~2× forward pass (gradient computation)
        - Memory: +1× parameters (gradient storage)
        - Latency: ~2× forward pass (more complex operations)

        EXAMPLE:
        >>> model = Linear(128, 64)
        >>> input_data = Tensor(np.random.randn(16, 128))
        >>> profiler = Profiler()
        >>> profile = profiler.profile_backward_pass(model, input_data)
        >>> print(f"Training iteration: {profile['total_latency_ms']:.2f} ms")
        Training iteration: 0.45 ms

        HINTS:
        - Total memory = parameters + activations + gradients
        - Optimizer memory depends on algorithm (SGD: 0×, Adam: 2×)
        - Consider gradient accumulation effects
        """
        ### BEGIN SOLUTION
        # Get forward pass profile
        forward_profile = self.profile_forward_pass(model, input_tensor)
        # Estimate backward pass (typically 2× forward)
        backward_flops = forward_profile['flops'] * 2
        backward_latency_ms = forward_profile['latency_ms'] * 2
        # Gradient memory (equal to parameter memory)
        gradient_memory_mb = forward_profile['parameter_memory_mb']
        # Total training iteration
        total_flops = forward_profile['flops'] + backward_flops
        total_latency_ms = forward_profile['latency_ms'] + backward_latency_ms
        total_memory_mb = (forward_profile['parameter_memory_mb'] +
                           forward_profile['activation_memory_mb'] +
                           gradient_memory_mb)
        # Training efficiency
        total_gflops_per_second = (total_flops / 1e9) / (total_latency_ms / 1000.0)
        # Optimizer memory estimates
        optimizer_memory_estimates = {
            'sgd': 0,                         # No extra memory
            'adam': gradient_memory_mb * 2,   # Momentum + velocity
            'adamw': gradient_memory_mb * 2,  # Same as Adam
        }
        return {
            # Forward pass
            'forward_flops': forward_profile['flops'],
            'forward_latency_ms': forward_profile['latency_ms'],
            'forward_memory_mb': forward_profile['peak_memory_mb'],
            # Backward pass estimates
            'backward_flops': backward_flops,
            'backward_latency_ms': backward_latency_ms,
            'gradient_memory_mb': gradient_memory_mb,
            # Total training iteration
            'total_flops': total_flops,
            'total_latency_ms': total_latency_ms,
            'total_memory_mb': total_memory_mb,
            'total_gflops_per_second': total_gflops_per_second,
            # Optimizer memory requirements
            'optimizer_memory_estimates': optimizer_memory_estimates,
            # Training insights
            'memory_efficiency': forward_profile['memory_efficiency'],
            'bottleneck': forward_profile['bottleneck']
        }
        ### END SOLUTION
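# Worked example (comment only): for a model with 1M parameters,
# gradient_memory_mb ≈ 1e6 * 4 / MB_TO_BYTES ≈ 3.8 MB, so the Adam estimate
# adds ~7.6 MB of optimizer state (momentum + velocity) on top of parameters
# and gradients, while SGD adds none.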
# %% ../../modules/14_profiling/14_profiling.ipynb 8
def quick_profile(model, input_tensor, profiler=None):
    """
    Quick profiling function for immediate insights.

    Provides a simplified interface for profiling that displays key metrics
    in a student-friendly format.

    Args:
        model: Model to profile
        input_tensor: Input data for profiling
        profiler: Optional Profiler instance (creates new one if None)

    Returns:
        dict: Profile results with key metrics

    Example:
        >>> model = Linear(128, 64)
        >>> input_data = Tensor(np.random.randn(16, 128))
        >>> results = quick_profile(model, input_data)
        >>> # Displays formatted output automatically
    """
    if profiler is None:
        profiler = Profiler()
    profile = profiler.profile_forward_pass(model, input_tensor)
    # Display formatted results
    print("🔬 Quick Profile Results:")
    print(f"   Parameters: {profile['parameters']:,}")
    print(f"   FLOPs: {profile['flops']:,}")
    print(f"   Latency: {profile['latency_ms']:.2f} ms")
    print(f"   Memory: {profile['peak_memory_mb']:.2f} MB")
    print(f"   Bottleneck: {profile['bottleneck']}")
    print(f"   Efficiency: {profile['computational_efficiency']*100:.1f}%")
    return profile
# %% ../../modules/14_profiling/14_profiling.ipynb 9
def analyze_weight_distribution(model, percentiles=[10, 25, 50, 75, 90]):
    """
    Analyze weight distribution for compression insights.

    Helps understand which weights are small and might be prunable.
    Used by Module 17 (Compression) to motivate pruning.

    Args:
        model: Model to analyze
        percentiles: List of percentiles to compute

    Returns:
        dict: Weight distribution statistics

    Example:
        >>> model = Linear(512, 512)
        >>> stats = analyze_weight_distribution(model)
        >>> print(f"Weights < 0.01: {stats['below_threshold_001']:.1f}%")
    """
    # Collect all weights
    weights = []
    if hasattr(model, 'parameters'):
        for param in model.parameters():
            weights.extend(param.data.flatten().tolist())
    elif hasattr(model, 'weight'):
        weights.extend(model.weight.data.flatten().tolist())
    else:
        return {'error': 'No weights found'}
    weights = np.array(weights)
    abs_weights = np.abs(weights)
    # Calculate statistics
    stats = {
        'total_weights': len(weights),
        'mean': float(np.mean(abs_weights)),
        'std': float(np.std(abs_weights)),
        'min': float(np.min(abs_weights)),
        'max': float(np.max(abs_weights)),
    }
    # Percentile analysis
    for p in percentiles:
        stats[f'percentile_{p}'] = float(np.percentile(abs_weights, p))
    # Threshold analysis (useful for pruning)
    for threshold in [0.001, 0.01, 0.1]:
        below = np.sum(abs_weights < threshold) / len(weights) * 100
        stats[f'below_threshold_{str(threshold).replace(".", "")}'] = below
    return stats
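
# --- Usage sketch (illustrative, not part of the generated module) -----------
# A minimal smoke test, assuming the TinyTorch Linear(in_features, out_features)
# constructor and the Tensor(ndarray) wrapper imported above. Guarded so that
# importing this module stays side-effect free.
if __name__ == "__main__":
    demo_model = Linear(128, 64)                     # 128*64 + 64 = 8,256 parameters
    demo_input = Tensor(np.random.randn(4, 128))     # small batch keeps the demo fast
    quick_profile(demo_model, demo_input)            # prints the formatted summary
    stats = analyze_weight_distribution(demo_model)  # pruning-oriented statistics
    print(f"Median |weight|: {stats['percentile_50']:.4f}")
    print(f"Weights < 0.01: {stats['below_threshold_001']:.1f}%")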