Rename ProfilerComplete to Profiler for cleaner API

- Updated all imports: ProfilerComplete → Profiler
- Updated Module 16: Uses Profiler for acceleration demos
- Updated Module 19: Uses Profiler in Benchmark class
- Updated all comments and docstrings
- Simpler, more professional naming (no awkward Complete suffix)
This commit is contained in:
Vijay Janapa Reddi
2025-11-06 20:35:21 -05:00
parent 96d0fc50db
commit 4ef3cb90bc
2 changed files with 108 additions and 11 deletions

View File

@@ -880,6 +880,103 @@ def analyze_acceleration_decision_framework():
analyze_acceleration_decision_framework()
# %% [markdown]
"""
## 5.5 Measuring Acceleration Gains with Profiler
Now let's use the **Profiler** tool you built in Module 15 to measure the actual performance improvements from vectorization. This demonstrates the full workflow: build profiling tools (M15), apply optimizations (M16), measure gains (M15+M16).
This is how professional ML engineers work: profile → optimize → measure → repeat.
"""
# %% nbgrader={"grade": false, "grade_id": "demo-profiler-acceleration", "solution": true}
# Import Profiler from Module 15
from tinytorch.profiling.profiler import Profiler
def demo_acceleration_with_profiler():
    """📊 Compare a loop-based and a vectorized linear layer with Profiler.

    Two weight-only linear layers of identical shape (128 -> 64) are built:
    one multiplies with explicit Python loops, the other delegates to
    ``vectorized_matmul``. Both are profiled on the same random batch of 32
    inputs, and latency, FLOPs, throughput and the resulting speedup are
    printed. Nothing is returned.
    """
    print("📊 Measuring Acceleration Gains with Profiler")
    print("=" * 70)

    profiler = Profiler()

    def _small_random_weight(rows, cols):
        """Return a float32 weight Tensor of shape (rows, cols), scaled by 0.01."""
        return Tensor(np.random.randn(rows, cols).astype(np.float32) * 0.01)

    class SlowLinear:
        """Baseline linear layer: matrix product written as nested loops."""

        def __init__(self, in_features, out_features):
            self.weight = _small_random_weight(in_features, out_features)
            self.name = "slow_linear"

        def forward(self, x):
            # Deliberately naive triple loop: this is the slow reference that
            # the vectorized layer is compared against, so it is not optimized.
            n_rows = x.shape[0]
            n_cols = self.weight.shape[1]
            out = np.zeros((n_rows, n_cols), dtype=np.float32)
            for row in range(n_rows):
                for col in range(n_cols):
                    for inner in range(x.shape[1]):
                        out[row, col] += x.data[row, inner] * self.weight.data[inner, col]
            return Tensor(out)

    class FastLinear:
        """Accelerated linear layer: a single vectorized matmul call."""

        def __init__(self, in_features, out_features):
            self.weight = _small_random_weight(in_features, out_features)
            self.name = "fast_linear"

        def forward(self, x):
            return vectorized_matmul(x, self.weight)

    # Problem size shared by both layers and the input batch.
    in_features, out_features = 128, 64
    batch_size = 32

    # Construction order (slow weights, fast weights, input) fixes the order
    # in which values are drawn from NumPy's global random generator.
    slow_model = SlowLinear(in_features, out_features)
    fast_model = FastLinear(in_features, out_features)
    input_tensor = Tensor(np.random.randn(batch_size, in_features).astype(np.float32))

    def _profile_and_report(model):
        """Profile one model, print its metrics, and return its latency in ms."""
        latency_ms = profiler.measure_latency(model, input_tensor, warmup=3, iterations=10)
        flops = profiler.count_flops(model, (batch_size, in_features))
        print(f" Latency: {latency_ms:.2f} ms")
        print(f" FLOPs: {flops:,}")
        # Latency is in milliseconds: convert to seconds, then scale to GFLOP/s.
        print(f" Throughput: {flops / (latency_ms / 1000) / 1e9:.2f} GFLOP/s")
        return latency_ms

    print("\n🐢 BEFORE: Loop-based implementation")
    print("-" * 70)
    slow_latency = _profile_and_report(slow_model)

    print("\n🚀 AFTER: Vectorized implementation")
    print("-" * 70)
    fast_latency = _profile_and_report(fast_model)

    print("\n📈 ACCELERATION GAINS")
    print("=" * 70)
    speedup = slow_latency / fast_latency
    print(f" Speedup: {speedup:.1f}x faster")
    print(f" Time saved: {slow_latency - fast_latency:.2f} ms per inference")
    print(f" Throughput improvement: {speedup:.1f}x more inferences/second")

    print("\n💡 Key Insight:")
    print(f" Vectorization with numpy.matmul leverages optimized BLAS libraries")
    print(f" that use SIMD instructions and cache-friendly memory access patterns.")
    print(f" This is why {speedup:.0f}x speedups are possible with the same FLOPs!")
    print("\n✅ This is the power of acceleration: same math, different execution!")
demo_acceleration_with_profiler()
# %% [markdown]
"""
## 6. Module Integration Test

View File

@@ -155,7 +155,7 @@ from contextlib import contextmanager
import warnings
# Import Profiler from Module 15 for measurement reuse
from tinytorch.profiling.profiler import ProfilerComplete
from tinytorch.profiling.profiler import Profiler
# %% [markdown]
"""
@@ -165,18 +165,18 @@ We'll build a comprehensive benchmarking system that handles statistical analysi
The architecture follows a hierarchical design:
```
ProfilerComplete (Module 15) ← Base measurement tools
Profiler (Module 15) ← Base measurement tools
BenchmarkResult ← Statistical container for measurements
Benchmark ← Uses ProfilerComplete + adds multi-model comparison
Benchmark ← Uses Profiler + adds multi-model comparison
BenchmarkSuite ← Multi-metric comprehensive evaluation
TinyMLPerf ← Standardized industry-style benchmarks
```
**Key Architectural Decision**: The `Benchmark` class reuses `ProfilerComplete` from Module 15 for individual model measurements, then adds statistical comparison across multiple models. This demonstrates proper systems architecture - build once, reuse everywhere!
**Key Architectural Decision**: The `Benchmark` class reuses `Profiler` from Module 15 for individual model measurements, then adds statistical comparison across multiple models. This demonstrates proper systems architecture - build once, reuse everywhere!
Each level adds capability while maintaining statistical rigor at the foundation.
"""
@@ -521,8 +521,8 @@ class Benchmark:
self.measurement_runs = measurement_runs
self.results = {}
# Use ProfilerComplete from Module 15 for measurements
self.profiler = ProfilerComplete()
# Use Profiler from Module 15 for measurements
self.profiler = Profiler()
# System information for metadata
self.system_info = {
@@ -534,7 +534,7 @@ class Benchmark:
}
def run_latency_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]:
"""Benchmark model inference latency using ProfilerComplete."""
"""Benchmark model inference latency using Profiler."""
results = {}
for i, model in enumerate(self.models):
@@ -548,7 +548,7 @@ class Benchmark:
# Fallback for simple models
input_tensor = np.random.randn(*input_shape).astype(np.float32)
# Use ProfilerComplete to measure latency with proper warmup and iterations
# Use Profiler to measure latency with proper warmup and iterations
try:
latency_ms = self.profiler.measure_latency(
model,
@@ -557,7 +557,7 @@ class Benchmark:
iterations=self.measurement_runs
)
# ProfilerComplete returns single median value
# Profiler returns single median value
# For BenchmarkResult, we need multiple measurements
# Run additional measurements for statistical analysis
latencies = []
@@ -628,7 +628,7 @@ class Benchmark:
return results
def run_memory_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]:
"""Benchmark model memory usage using ProfilerComplete."""
"""Benchmark model memory usage using Profiler."""
results = {}
for i, model in enumerate(self.models):
@@ -637,7 +637,7 @@ class Benchmark:
for run in range(self.measurement_runs):
try:
# Use ProfilerComplete to measure memory
# Use Profiler to measure memory
memory_stats = self.profiler.measure_memory(model, input_shape)
# Use peak_memory_mb as the primary metric
memory_used = memory_stats['peak_memory_mb']