# %% [markdown]
"""
## 5.5 Measuring Acceleration Gains with Profiler

Now let's use the **Profiler** tool you built in Module 15 to measure the actual performance improvements from vectorization. This demonstrates the full workflow: build profiling tools (M15), apply optimizations (M16), measure gains (M15+M16).

This is how professional ML engineers work: profile → optimize → measure → repeat.
"""

# %% nbgrader={"grade": false, "grade_id": "demo-profiler-acceleration", "solution": true}
# Import Profiler from Module 15
from tinytorch.profiling.profiler import Profiler


def demo_acceleration_with_profiler():
    """📊 Demonstrate acceleration gains using Profiler from Module 15.

    Builds two linear layers of identical shape -- one computing the product
    with an explicit triple loop, one delegating to ``vectorized_matmul`` --
    and prints latency, FLOP count and derived throughput for each, followed
    by the resulting speedup.

    Returns:
        None. All results are written to stdout.
    """
    print("📊 Measuring Acceleration Gains with Profiler")
    print("=" * 70)

    profiler = Profiler()

    # Create two simple models: one slow (loop-based), one fast (vectorized)
    class SlowLinear:
        """Linear layer using explicit loops (slow)."""

        def __init__(self, in_features, out_features):
            self.weight = Tensor(np.random.randn(in_features, out_features).astype(np.float32) * 0.01)
            self.name = "slow_linear"

        def forward(self, x):
            # Explicit loop implementation (for demonstration).
            # Deliberately left naive: this is the baseline being measured,
            # so nothing is hoisted out of the loops.
            batch_size = x.shape[0]
            out_features = self.weight.shape[1]
            result = np.zeros((batch_size, out_features), dtype=np.float32)

            for i in range(batch_size):
                for j in range(out_features):
                    for k in range(x.shape[1]):
                        result[i, j] += x.data[i, k] * self.weight.data[k, j]

            return Tensor(result)

    class FastLinear:
        """Linear layer using vectorized matmul (fast)."""

        def __init__(self, in_features, out_features):
            self.weight = Tensor(np.random.randn(in_features, out_features).astype(np.float32) * 0.01)
            self.name = "fast_linear"

        def forward(self, x):
            # Vectorized implementation
            return vectorized_matmul(x, self.weight)

    in_features, out_features = 128, 64
    batch_size = 32

    # Create models
    slow_model = SlowLinear(in_features, out_features)
    fast_model = FastLinear(in_features, out_features)

    # Create input
    input_tensor = Tensor(np.random.randn(batch_size, in_features).astype(np.float32))

    def _gflops(flops, latency_ms):
        """Turn a FLOP count and a latency in milliseconds into GFLOP/s."""
        # A coarse timer can report 0.0 ms for a very fast call; report an
        # unbounded rate instead of raising ZeroDivisionError.
        if latency_ms <= 0:
            return float("inf")
        return flops / (latency_ms / 1000) / 1e9

    def _profile(model, heading):
        """Print one measurement section for `model` and return its latency."""
        print(heading)
        print("-" * 70)

        latency = profiler.measure_latency(model, input_tensor, warmup=3, iterations=10)
        flops = profiler.count_flops(model, (batch_size, in_features))

        print(f" Latency: {latency:.2f} ms")
        print(f" FLOPs: {flops:,}")
        print(f" Throughput: {_gflops(flops, latency):.2f} GFLOP/s")
        return latency

    # Measure slow model, then fast model, on the same input
    slow_latency = _profile(slow_model, "\n🐢 BEFORE: Loop-based implementation")
    fast_latency = _profile(fast_model, "\n🚀 AFTER: Vectorized implementation")

    print("\n📈 ACCELERATION GAINS")
    print("=" * 70)
    # Same zero-latency guard as in _gflops.
    speedup = slow_latency / fast_latency if fast_latency > 0 else float("inf")
    print(f" Speedup: {speedup:.1f}x faster")
    print(f" Time saved: {slow_latency - fast_latency:.2f} ms per inference")
    print(f" Throughput improvement: {speedup:.1f}x more inferences/second")

    print("\n💡 Key Insight:")
    print(" Vectorization with numpy.matmul leverages optimized BLAS libraries")
    print(" that use SIMD instructions and cache-friendly memory access patterns.")
    print(f" This is why {speedup:.0f}x speedups are possible with the same FLOPs!")
    print("\n✅ This is the power of acceleration: same math, different execution!")


demo_acceleration_with_profiler()
""" @@ -521,8 +521,8 @@ class Benchmark: self.measurement_runs = measurement_runs self.results = {} - # Use ProfilerComplete from Module 15 for measurements - self.profiler = ProfilerComplete() + # Use Profiler from Module 15 for measurements + self.profiler = Profiler() # System information for metadata self.system_info = { @@ -534,7 +534,7 @@ class Benchmark: } def run_latency_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]: - """Benchmark model inference latency using ProfilerComplete.""" + """Benchmark model inference latency using Profiler.""" results = {} for i, model in enumerate(self.models): @@ -548,7 +548,7 @@ class Benchmark: # Fallback for simple models input_tensor = np.random.randn(*input_shape).astype(np.float32) - # Use ProfilerComplete to measure latency with proper warmup and iterations + # Use Profiler to measure latency with proper warmup and iterations try: latency_ms = self.profiler.measure_latency( model, @@ -557,7 +557,7 @@ class Benchmark: iterations=self.measurement_runs ) - # ProfilerComplete returns single median value + # Profiler returns single median value # For BenchmarkResult, we need multiple measurements # Run additional measurements for statistical analysis latencies = [] @@ -628,7 +628,7 @@ class Benchmark: return results def run_memory_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]: - """Benchmark model memory usage using ProfilerComplete.""" + """Benchmark model memory usage using Profiler.""" results = {} for i, model in enumerate(self.models): @@ -637,7 +637,7 @@ class Benchmark: for run in range(self.measurement_runs): try: - # Use ProfilerComplete to measure memory + # Use Profiler to measure memory memory_stats = self.profiler.measure_memory(model, input_shape) # Use peak_memory_mb as the primary metric memory_used = memory_stats['peak_memory_mb']