diff --git a/modules/source/16_acceleration/acceleration_dev.py b/modules/source/16_acceleration/acceleration_dev.py
index 94bd8206..7e2fcdb1 100644
--- a/modules/source/16_acceleration/acceleration_dev.py
+++ b/modules/source/16_acceleration/acceleration_dev.py
@@ -880,6 +880,120 @@ def analyze_acceleration_decision_framework():
 
 analyze_acceleration_decision_framework()
 
+# %% [markdown]
+"""
+## 5.5 Measuring Acceleration Gains with Profiler
+
+Now let's use the **Profiler** tool you built in Module 15 to measure the actual performance improvements from vectorization. This demonstrates the full workflow: build profiling tools (M15), apply optimizations (M16), measure gains (M15+M16).
+
+This is how professional ML engineers work: profile → optimize → measure → repeat.
+"""
+
+# %% nbgrader={"grade": false, "grade_id": "demo-profiler-acceleration", "solution": true}
+# Import Profiler from Module 15
+from tinytorch.profiling.profiler import Profiler
+
+def demo_acceleration_with_profiler():
+    """📊 Demonstrate acceleration gains using Profiler from Module 15.
+
+    Builds a triple-loop linear layer and a `vectorized_matmul` one, measures
+    both on the same input with `Profiler.measure_latency` and
+    `Profiler.count_flops`, and prints latency, FLOPs, throughput and speedup.
+    Takes no arguments and returns None; all output goes to stdout.
+
+    The ratios are guarded: a non-positive measured latency gives 0.00 GFLOP/s
+    and an infinite speedup rather than a ZeroDivisionError, which would
+    otherwise abort the module because this demo is called at import time.
+    """
+    print("📊 Measuring Acceleration Gains with Profiler")
+    print("=" * 70)
+
+    profiler = Profiler()
+
+    def gflops_per_second(flops, latency_ms):
+        """Convert a FLOP count and a latency in ms to GFLOP/s (0.0 if latency <= 0)."""
+        if latency_ms <= 0:
+            return 0.0
+        return flops / (latency_ms / 1000) / 1e9
+
+    # Create two simple models: one slow (loop-based), one fast (vectorized)
+    class SlowLinear:
+        """Linear layer using explicit loops (slow)."""
+        def __init__(self, in_features, out_features):
+            self.weight = Tensor(np.random.randn(in_features, out_features).astype(np.float32) * 0.01)
+            self.name = "slow_linear"
+
+        def forward(self, x):
+            # Explicit loop implementation (for demonstration)
+            batch_size = x.shape[0]
+            out_features = self.weight.shape[1]
+            result = np.zeros((batch_size, out_features), dtype=np.float32)
+
+            for i in range(batch_size):
+                for j in range(out_features):
+                    for k in range(x.shape[1]):
+                        result[i, j] += x.data[i, k] * self.weight.data[k, j]
+
+            return Tensor(result)
+
+    class FastLinear:
+        """Linear layer using vectorized matmul (fast)."""
+        def __init__(self, in_features, out_features):
+            self.weight = Tensor(np.random.randn(in_features, out_features).astype(np.float32) * 0.01)
+            self.name = "fast_linear"
+
+        def forward(self, x):
+            # Vectorized implementation
+            return vectorized_matmul(x, self.weight)
+
+    in_features, out_features = 128, 64
+    batch_size = 32
+
+    # Create models
+    slow_model = SlowLinear(in_features, out_features)
+    fast_model = FastLinear(in_features, out_features)
+
+    # Create input
+    input_tensor = Tensor(np.random.randn(batch_size, in_features).astype(np.float32))
+
+    print("\n🐢 BEFORE: Loop-based implementation")
+    print("-" * 70)
+
+    # Measure slow model
+    slow_latency = profiler.measure_latency(slow_model, input_tensor, warmup=3, iterations=10)
+    slow_flops = profiler.count_flops(slow_model, (batch_size, in_features))
+
+    print(f"   Latency: {slow_latency:.2f} ms")
+    print(f"   FLOPs: {slow_flops:,}")
+    print(f"   Throughput: {gflops_per_second(slow_flops, slow_latency):.2f} GFLOP/s")
+
+    print("\n🚀 AFTER: Vectorized implementation")
+    print("-" * 70)
+
+    # Measure fast model
+    fast_latency = profiler.measure_latency(fast_model, input_tensor, warmup=3, iterations=10)
+    fast_flops = profiler.count_flops(fast_model, (batch_size, in_features))
+
+    print(f"   Latency: {fast_latency:.2f} ms")
+    print(f"   FLOPs: {fast_flops:,}")
+    print(f"   Throughput: {gflops_per_second(fast_flops, fast_latency):.2f} GFLOP/s")
+
+    print("\n📈 ACCELERATION GAINS")
+    print("=" * 70)
+    # Guard the ratio: a 0 ms reading for the fast path must not crash the demo
+    speedup = slow_latency / fast_latency if fast_latency > 0 else float("inf")
+    print(f"   Speedup: {speedup:.1f}x faster")
+    print(f"   Time saved: {slow_latency - fast_latency:.2f} ms per inference")
+    print(f"   Throughput improvement: {speedup:.1f}x more inferences/second")
+
+    print("\n💡 Key Insight:")
+    print("   Vectorization with numpy.matmul leverages optimized BLAS libraries")
+    print("   that use SIMD instructions and cache-friendly memory access patterns.")
+    print(f"   This is why {speedup:.0f}x speedups are possible with the same FLOPs!")
+    print("\n✅ This is the power of acceleration: same math, different execution!")
+
+demo_acceleration_with_profiler()
+
 # %% [markdown]
 """
 ## 6. Module Integration Test
diff --git a/modules/source/19_benchmarking/benchmarking_dev.py b/modules/source/19_benchmarking/benchmarking_dev.py
index 51de25f8..e91c6891 100644
--- a/modules/source/19_benchmarking/benchmarking_dev.py
+++ b/modules/source/19_benchmarking/benchmarking_dev.py
@@ -155,7 +155,7 @@ from contextlib import contextmanager
 import warnings
 
 # Import Profiler from Module 15 for measurement reuse
-from tinytorch.profiling.profiler import ProfilerComplete
+from tinytorch.profiling.profiler import Profiler
 
 # %% [markdown]
 """
@@ -165,18 +165,18 @@ We'll build a comprehensive benchmarking system that handles statistical analysi
 
 The architecture follows a hierarchical design:
 ```
-ProfilerComplete (Module 15) ← Base measurement tools
+Profiler (Module 15) ← Base measurement tools
        ↓
 BenchmarkResult ← Statistical container for measurements
        ↓
-Benchmark ← Uses ProfilerComplete + adds multi-model comparison
+Benchmark ← Uses Profiler + adds multi-model comparison
        ↓
 BenchmarkSuite ← Multi-metric comprehensive evaluation
        ↓
 TinyMLPerf ← Standardized industry-style benchmarks
 ```
 
-**Key Architectural Decision**: The `Benchmark` class reuses `ProfilerComplete` from Module 15 for individual model measurements, then adds statistical comparison across multiple models. This demonstrates proper systems architecture - build once, reuse everywhere!
+**Key Architectural Decision**: The `Benchmark` class reuses `Profiler` from Module 15 for individual model measurements, then adds statistical comparison across multiple models. This demonstrates proper systems architecture - build once, reuse everywhere!
 
 Each level adds capability while maintaining statistical rigor at the foundation.
 """
@@ -521,8 +521,8 @@ class Benchmark:
         self.measurement_runs = measurement_runs
         self.results = {}
         
-        # Use ProfilerComplete from Module 15 for measurements
-        self.profiler = ProfilerComplete()
+        # Use Profiler from Module 15 for measurements
+        self.profiler = Profiler()
 
         # System information for metadata
         self.system_info = {
@@ -534,7 +534,7 @@ class Benchmark:
         }
 
     def run_latency_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]:
-        """Benchmark model inference latency using ProfilerComplete."""
+        """Benchmark model inference latency using Profiler."""
         results = {}
 
         for i, model in enumerate(self.models):
@@ -548,7 +548,7 @@ class Benchmark:
                 # Fallback for simple models
                 input_tensor = np.random.randn(*input_shape).astype(np.float32)
 
-            # Use ProfilerComplete to measure latency with proper warmup and iterations
+            # Use Profiler to measure latency with proper warmup and iterations
             try:
                 latency_ms = self.profiler.measure_latency(
                     model, 
@@ -557,7 +557,7 @@ class Benchmark:
                     iterations=self.measurement_runs
                 )
                 
-                # ProfilerComplete returns single median value
+                # Profiler returns a single median value
                 # For BenchmarkResult, we need multiple measurements
                 # Run additional measurements for statistical analysis
                 latencies = []
@@ -628,7 +628,7 @@ class Benchmark:
         return results
 
     def run_memory_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]:
-        """Benchmark model memory usage using ProfilerComplete."""
+        """Benchmark model memory usage using Profiler."""
         results = {}
 
         for i, model in enumerate(self.models):
@@ -637,7 +637,7 @@ class Benchmark:
 
             for run in range(self.measurement_runs):
                 try:
-                    # Use ProfilerComplete to measure memory
+                    # Use Profiler to measure memory
                     memory_stats = self.profiler.measure_memory(model, input_shape)
                     # Use peak_memory_mb as the primary metric
                     memory_used = memory_stats['peak_memory_mb']