mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-02 01:51:06 -05:00
Rename ProfilerComplete to Profiler for cleaner API
- Updated all imports: ProfilerComplete → Profiler - Updated Module 16: Uses Profiler for acceleration demos - Updated Module 19: Uses Profiler in Benchmark class - Updated all comments and docstrings - Simpler, more professional naming (no awkward Complete suffix)
This commit is contained in:
@@ -880,6 +880,103 @@ def analyze_acceleration_decision_framework():
|
||||
|
||||
analyze_acceleration_decision_framework()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 5.5 Measuring Acceleration Gains with Profiler
|
||||
|
||||
Now let's use the **Profiler** tool you built in Module 15 to measure the actual performance improvements from vectorization. This demonstrates the full workflow: build profiling tools (M15), apply optimizations (M16), measure gains (M15+M16).
|
||||
|
||||
This is how professional ML engineers work: profile → optimize → measure → repeat.
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "demo-profiler-acceleration", "solution": true}
|
||||
# Import Profiler from Module 15
|
||||
from tinytorch.profiling.profiler import Profiler
|
||||
|
||||
def demo_acceleration_with_profiler():
    """📊 Demonstrate acceleration gains using Profiler from Module 15.

    Builds two linear layers with the same input/output sizes -- one that
    multiplies with an explicit triple Python loop and one that delegates
    to ``vectorized_matmul`` -- then uses ``Profiler`` to measure latency
    and count FLOPs for each, and prints a before/after report with the
    resulting speedup.

    Takes no arguments and returns ``None``; the only effect is the
    printed report.
    """

    def safe_div(numerator, denominator):
        """Return numerator / denominator, or ``inf`` for a non-positive denominator.

        A measured latency can round to 0.0 on timers with coarse
        resolution; without this guard the report would abort with
        ZeroDivisionError instead of printing.
        """
        if denominator > 0:
            return numerator / denominator
        return float("inf")

    print("📊 Measuring Acceleration Gains with Profiler")
    print("=" * 70)

    profiler = Profiler()

    # Create two simple models: one slow (loop-based), one fast (vectorized)
    class SlowLinear:
        """Linear layer using explicit loops (slow)."""

        def __init__(self, in_features, out_features):
            self.weight = Tensor(np.random.randn(in_features, out_features).astype(np.float32) * 0.01)
            self.name = "slow_linear"

        def forward(self, x):
            # Explicit loop implementation (for demonstration).
            # Deliberately left unoptimized: this is the "before" baseline,
            # so the per-element Python loops are the point.
            batch_size = x.shape[0]
            out_features = self.weight.shape[1]
            result = np.zeros((batch_size, out_features), dtype=np.float32)

            for i in range(batch_size):
                for j in range(out_features):
                    for k in range(x.shape[1]):
                        result[i, j] += x.data[i, k] * self.weight.data[k, j]

            return Tensor(result)

    class FastLinear:
        """Linear layer using vectorized matmul (fast)."""

        def __init__(self, in_features, out_features):
            self.weight = Tensor(np.random.randn(in_features, out_features).astype(np.float32) * 0.01)
            self.name = "fast_linear"

        def forward(self, x):
            # Vectorized implementation
            return vectorized_matmul(x, self.weight)

    in_features, out_features = 128, 64
    batch_size = 32

    # Create models
    slow_model = SlowLinear(in_features, out_features)
    fast_model = FastLinear(in_features, out_features)

    # Create input (the same tensor is fed to both models)
    input_tensor = Tensor(np.random.randn(batch_size, in_features).astype(np.float32))

    print("\n🐢 BEFORE: Loop-based implementation")
    print("-" * 70)

    # Measure slow model
    slow_latency = profiler.measure_latency(slow_model, input_tensor, warmup=3, iterations=10)
    slow_flops = profiler.count_flops(slow_model, (batch_size, in_features))

    print(f" Latency: {slow_latency:.2f} ms")
    print(f" FLOPs: {slow_flops:,}")
    # Latency is reported in ms: convert to seconds, then scale to GFLOP/s.
    print(f" Throughput: {safe_div(slow_flops, slow_latency / 1000) / 1e9:.2f} GFLOP/s")

    print("\n🚀 AFTER: Vectorized implementation")
    print("-" * 70)

    # Measure fast model
    fast_latency = profiler.measure_latency(fast_model, input_tensor, warmup=3, iterations=10)
    fast_flops = profiler.count_flops(fast_model, (batch_size, in_features))

    print(f" Latency: {fast_latency:.2f} ms")
    print(f" FLOPs: {fast_flops:,}")
    print(f" Throughput: {safe_div(fast_flops, fast_latency / 1000) / 1e9:.2f} GFLOP/s")

    print("\n📈 ACCELERATION GAINS")
    print("=" * 70)
    # The vectorized path is the one most likely to measure as 0.0 ms,
    # so the speedup ratio is guarded as well.
    speedup = safe_div(slow_latency, fast_latency)
    print(f" Speedup: {speedup:.1f}x faster")
    print(f" Time saved: {slow_latency - fast_latency:.2f} ms per inference")
    print(f" Throughput improvement: {speedup:.1f}x more inferences/second")

    print("\n💡 Key Insight:")
    print(" Vectorization with numpy.matmul leverages optimized BLAS libraries")
    print(" that use SIMD instructions and cache-friendly memory access patterns.")
    print(f" This is why {speedup:.0f}x speedups are possible with the same FLOPs!")
    print("\n✅ This is the power of acceleration: same math, different execution!")
|
||||
|
||||
demo_acceleration_with_profiler()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 6. Module Integration Test
|
||||
|
||||
@@ -155,7 +155,7 @@ from contextlib import contextmanager
|
||||
import warnings
|
||||
|
||||
# Import Profiler from Module 15 for measurement reuse
|
||||
from tinytorch.profiling.profiler import ProfilerComplete
|
||||
from tinytorch.profiling.profiler import Profiler
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -165,18 +165,18 @@ We'll build a comprehensive benchmarking system that handles statistical analysi
|
||||
|
||||
The architecture follows a hierarchical design:
|
||||
```
|
||||
ProfilerComplete (Module 15) ← Base measurement tools
|
||||
Profiler (Module 15) ← Base measurement tools
|
||||
↓
|
||||
BenchmarkResult ← Statistical container for measurements
|
||||
↓
|
||||
Benchmark ← Uses ProfilerComplete + adds multi-model comparison
|
||||
Benchmark ← Uses Profiler + adds multi-model comparison
|
||||
↓
|
||||
BenchmarkSuite ← Multi-metric comprehensive evaluation
|
||||
↓
|
||||
TinyMLPerf ← Standardized industry-style benchmarks
|
||||
```
|
||||
|
||||
**Key Architectural Decision**: The `Benchmark` class reuses `ProfilerComplete` from Module 15 for individual model measurements, then adds statistical comparison across multiple models. This demonstrates proper systems architecture - build once, reuse everywhere!
|
||||
**Key Architectural Decision**: The `Benchmark` class reuses `Profiler` from Module 15 for individual model measurements, then adds statistical comparison across multiple models. This demonstrates proper systems architecture - build once, reuse everywhere!
|
||||
|
||||
Each level adds capability while maintaining statistical rigor at the foundation.
|
||||
"""
|
||||
@@ -521,8 +521,8 @@ class Benchmark:
|
||||
self.measurement_runs = measurement_runs
|
||||
self.results = {}
|
||||
|
||||
# Use ProfilerComplete from Module 15 for measurements
|
||||
self.profiler = ProfilerComplete()
|
||||
# Use Profiler from Module 15 for measurements
|
||||
self.profiler = Profiler()
|
||||
|
||||
# System information for metadata
|
||||
self.system_info = {
|
||||
@@ -534,7 +534,7 @@ class Benchmark:
|
||||
}
|
||||
|
||||
def run_latency_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]:
|
||||
"""Benchmark model inference latency using ProfilerComplete."""
|
||||
"""Benchmark model inference latency using Profiler."""
|
||||
results = {}
|
||||
|
||||
for i, model in enumerate(self.models):
|
||||
@@ -548,7 +548,7 @@ class Benchmark:
|
||||
# Fallback for simple models
|
||||
input_tensor = np.random.randn(*input_shape).astype(np.float32)
|
||||
|
||||
# Use ProfilerComplete to measure latency with proper warmup and iterations
|
||||
# Use Profiler to measure latency with proper warmup and iterations
|
||||
try:
|
||||
latency_ms = self.profiler.measure_latency(
|
||||
model,
|
||||
@@ -557,7 +557,7 @@ class Benchmark:
|
||||
iterations=self.measurement_runs
|
||||
)
|
||||
|
||||
# ProfilerComplete returns single median value
|
||||
# Profiler returns single median value
|
||||
# For BenchmarkResult, we need multiple measurements
|
||||
# Run additional measurements for statistical analysis
|
||||
latencies = []
|
||||
@@ -628,7 +628,7 @@ class Benchmark:
|
||||
return results
|
||||
|
||||
def run_memory_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]:
|
||||
"""Benchmark model memory usage using ProfilerComplete."""
|
||||
"""Benchmark model memory usage using Profiler."""
|
||||
results = {}
|
||||
|
||||
for i, model in enumerate(self.models):
|
||||
@@ -637,7 +637,7 @@ class Benchmark:
|
||||
|
||||
for run in range(self.measurement_runs):
|
||||
try:
|
||||
# Use ProfilerComplete to measure memory
|
||||
# Use Profiler to measure memory
|
||||
memory_stats = self.profiler.measure_memory(model, input_shape)
|
||||
# Use peak_memory_mb as the primary metric
|
||||
memory_used = memory_stats['peak_memory_mb']
|
||||
|
||||
Reference in New Issue
Block a user