diff --git a/tinytorch/benchmarking/benchmark.py b/tinytorch/benchmarking/benchmark.py
new file mode 100644
index 00000000..138f627a
--- /dev/null
+++ b/tinytorch/benchmarking/benchmark.py
@@ -0,0 +1,1076 @@
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║                        🚨 CRITICAL WARNING 🚨                                ║
+# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
+# ║                                                                               ║
+# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
+# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
+# ║                                                                               ║
+# ║  ✅ TO EDIT: modules/source/XX_benchmark/benchmark_dev.py           ║
+# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
+# ║                                                                               ║
+# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
+# ║     Editing it directly may break module functionality and training.         ║
+# ║                                                                               ║
+# ║  🎓 LEARNING TIP: Work in modules/source/ - that's where real development    ║
+# ║     happens! The tinytorch/ directory is just the compiled output.           ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
+# %% auto 0
+__all__ = ['OlympicEvent', 'Benchmark', 'test_unit_benchmark', 'BenchmarkSuite', 'test_unit_benchmark_suite', 'TinyMLPerf',
+           'test_unit_tinymlperf', 'calculate_normalized_scores']
+
+# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 0
+#| default_exp benchmarking.benchmark
+#| export
+
+# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 6
+from enum import Enum
+
+class OlympicEvent(Enum):
+    """
+    TorchPerf Olympics event categories.
+    
+    Each event optimizes for different objectives with specific constraints.
+    Students choose their event and compete for medals!
+    """
+    LATENCY_SPRINT = "latency_sprint"      # Minimize latency (accuracy >= 85%)
+    MEMORY_CHALLENGE = "memory_challenge"   # Minimize memory (accuracy >= 85%)
+    ACCURACY_CONTEST = "accuracy_contest"   # Maximize accuracy (latency < 100ms, memory < 10MB)
+    ALL_AROUND = "all_around"               # Best balanced score across all metrics
+    EXTREME_PUSH = "extreme_push"           # Most aggressive optimization (accuracy >= 80%)
+
+# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 13
+class Benchmark:
+    """
+    Professional benchmarking system for ML models and operations.
+
+    TODO: Implement a comprehensive benchmark runner with statistical rigor
+
+    APPROACH:
+    1. Support multiple models, datasets, and metrics
+    2. Run repeated measurements with proper warmup
+    3. Control for system variance and compute confidence intervals
+    4. Generate structured results for analysis
+
+    EXAMPLE:
+    >>> benchmark = Benchmark(models=[model1, model2], datasets=[test_data])
+    >>> results = benchmark.run_accuracy_benchmark()
+    >>> benchmark.plot_results(results)
+
+    HINTS:
+    - Use warmup runs to stabilize performance
+    - Collect multiple samples for statistical significance
+    - Store metadata about system conditions
+    - Provide different benchmark types (accuracy, latency, memory)
+    """
+    ### BEGIN SOLUTION
+    def __init__(self, models: List[Any], datasets: List[Any],
+                 warmup_runs: int = 5, measurement_runs: int = 10):
+        """Initialize benchmark with models and datasets."""
+        self.models = models
+        self.datasets = datasets
+        self.warmup_runs = warmup_runs
+        self.measurement_runs = measurement_runs
+        self.results = {}
+        
+        # Use Profiler from Module 15 for measurements
+        self.profiler = Profiler()
+
+        # System information for metadata
+        self.system_info = {
+            'platform': platform.platform(),
+            'processor': platform.processor(),
+            'python_version': platform.python_version(),
+            'memory_gb': psutil.virtual_memory().total / (1024**3),
+            'cpu_count': psutil.cpu_count()
+        }
+
+    def run_latency_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]:
+        """Benchmark model inference latency using Profiler."""
+        results = {}
+
+        for i, model in enumerate(self.models):
+            model_name = getattr(model, 'name', f'model_{i}')
+            
+            # Create input tensor for profiling
+            try:
+                from tinytorch.core.tensor import Tensor
+                input_tensor = Tensor(np.random.randn(*input_shape).astype(np.float32))
+            except:
+                # Fallback for simple models
+                input_tensor = np.random.randn(*input_shape).astype(np.float32)
+
+            # Use Profiler to measure latency with proper warmup and iterations
+            try:
+                latency_ms = self.profiler.measure_latency(
+                    model, 
+                    input_tensor,
+                    warmup=self.warmup_runs,
+                    iterations=self.measurement_runs
+                )
+                
+                # Profiler returns single median value
+                # For BenchmarkResult, we need multiple measurements
+                # Run additional measurements for statistical analysis
+                latencies = []
+                for _ in range(self.measurement_runs):
+                    single_latency = self.profiler.measure_latency(
+                        model, input_tensor, warmup=0, iterations=1
+                    )
+                    latencies.append(single_latency)
+                
+            except:
+                # Fallback: use precise_timer for models that don't support profiler
+                latencies = []
+                for _ in range(self.measurement_runs):
+                    with precise_timer() as timer:
+                        try:
+                            if hasattr(model, 'forward'):
+                                model.forward(input_tensor)
+                            elif hasattr(model, 'predict'):
+                                model.predict(input_tensor)
+                            elif callable(model):
+                                model(input_tensor)
+                            else:
+                                time.sleep(0.001)
+                        except:
+                            time.sleep(0.001 + np.random.normal(0, 0.0001))
+                    latencies.append(timer.elapsed * 1000)
+
+            results[model_name] = BenchmarkResult(
+                f"{model_name}_latency_ms",
+                latencies,
+                metadata={'input_shape': input_shape, **self.system_info}
+            )
+
+        return results
+
+    def run_accuracy_benchmark(self) -> Dict[str, BenchmarkResult]:
+        """Benchmark model accuracy across datasets."""
+        results = {}
+
+        for i, model in enumerate(self.models):
+            model_name = getattr(model, 'name', f'model_{i}')
+            accuracies = []
+
+            for dataset in self.datasets:
+                # Simulate accuracy measurement
+                # In practice, this would evaluate the model on the dataset
+                try:
+                    if hasattr(model, 'evaluate'):
+                        accuracy = model.evaluate(dataset)
+                    else:
+                        # Simulate accuracy for demonstration
+                        base_accuracy = 0.85 + i * 0.05  # Different models have different base accuracies
+                        accuracy = base_accuracy + np.random.normal(0, 0.02)  # Add noise
+                        accuracy = max(0.0, min(1.0, accuracy))  # Clamp to [0, 1]
+                except:
+                    # Fallback simulation
+                    accuracy = 0.80 + np.random.normal(0, 0.05)
+                    accuracy = max(0.0, min(1.0, accuracy))
+
+                accuracies.append(accuracy)
+
+            results[model_name] = BenchmarkResult(
+                f"{model_name}_accuracy",
+                accuracies,
+                metadata={'num_datasets': len(self.datasets), **self.system_info}
+            )
+
+        return results
+
+    def run_memory_benchmark(self, input_shape: Tuple[int, ...] = (1, 28, 28)) -> Dict[str, BenchmarkResult]:
+        """Benchmark model memory usage using Profiler."""
+        results = {}
+
+        for i, model in enumerate(self.models):
+            model_name = getattr(model, 'name', f'model_{i}')
+            memory_usages = []
+
+            for run in range(self.measurement_runs):
+                try:
+                    # Use Profiler to measure memory
+                    memory_stats = self.profiler.measure_memory(model, input_shape)
+                    # Use peak_memory_mb as the primary metric
+                    memory_used = memory_stats['peak_memory_mb']
+                except:
+                    # Fallback: measure with psutil
+                    process = psutil.Process()
+                    memory_before = process.memory_info().rss / (1024**2)  # MB
+
+                    try:
+                        dummy_input = np.random.randn(*input_shape).astype(np.float32)
+                        if hasattr(model, 'forward'):
+                            model.forward(dummy_input)
+                        elif hasattr(model, 'predict'):
+                            model.predict(dummy_input)
+                        elif callable(model):
+                            model(dummy_input)
+                    except:
+                        pass
+
+                    memory_after = process.memory_info().rss / (1024**2)  # MB
+                    memory_used = max(0, memory_after - memory_before)
+
+                    # If no significant memory change detected, estimate from parameters
+                    if memory_used < 1.0:
+                        try:
+                            param_count = self.profiler.count_parameters(model)
+                            memory_used = param_count * 4 / (1024**2)  # 4 bytes per float32
+                        except:
+                            memory_used = 8 + np.random.normal(0, 1)  # Default estimate
+
+                memory_usages.append(max(0, memory_used))
+
+            results[model_name] = BenchmarkResult(
+                f"{model_name}_memory_mb",
+                memory_usages,
+                metadata={'input_shape': input_shape, **self.system_info}
+            )
+
+        return results
+
+    def compare_models(self, metric: str = "latency") -> pd.DataFrame:
+        """Compare models across a specific metric."""
+        if metric == "latency":
+            results = self.run_latency_benchmark()
+        elif metric == "accuracy":
+            results = self.run_accuracy_benchmark()
+        elif metric == "memory":
+            results = self.run_memory_benchmark()
+        else:
+            raise ValueError(f"Unknown metric: {metric}")
+
+        # Convert to DataFrame for easy comparison
+        comparison_data = []
+        for model_name, result in results.items():
+            comparison_data.append({
+                'model': model_name.replace(f'_{metric}', '').replace('_ms', '').replace('_mb', ''),
+                'metric': metric,
+                'mean': result.mean,
+                'std': result.std,
+                'ci_lower': result.ci_lower,
+                'ci_upper': result.ci_upper,
+                'count': result.count
+            })
+
+        return pd.DataFrame(comparison_data)
+    ### END SOLUTION
+
+def test_unit_benchmark():
+    """🔬 Test Benchmark class functionality."""
+    print("🔬 Unit Test: Benchmark...")
+
+    # Create mock models for testing
+    class MockModel:
+        def __init__(self, name):
+            self.name = name
+
+        def forward(self, x):
+            time.sleep(0.001)  # Simulate computation
+            return x
+
+    models = [MockModel("fast_model"), MockModel("slow_model")]
+    datasets = [{"data": "test1"}, {"data": "test2"}]
+
+    benchmark = Benchmark(models, datasets, warmup_runs=2, measurement_runs=3)
+
+    # Test latency benchmark
+    latency_results = benchmark.run_latency_benchmark()
+    assert len(latency_results) == 2
+    assert "fast_model" in latency_results
+    assert all(isinstance(result, BenchmarkResult) for result in latency_results.values())
+
+    # Test accuracy benchmark
+    accuracy_results = benchmark.run_accuracy_benchmark()
+    assert len(accuracy_results) == 2
+    assert all(0 <= result.mean <= 1 for result in accuracy_results.values())
+
+    # Test memory benchmark
+    memory_results = benchmark.run_memory_benchmark()
+    assert len(memory_results) == 2
+    assert all(result.mean >= 0 for result in memory_results.values())
+
+    # Test comparison
+    comparison_df = benchmark.compare_models("latency")
+    assert len(comparison_df) == 2
+    assert "model" in comparison_df.columns
+    assert "mean" in comparison_df.columns
+
+    print("✅ Benchmark works correctly!")
+
+test_unit_benchmark()
+
+# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 15
+class BenchmarkSuite:
+    """
+    Comprehensive benchmark suite for ML systems evaluation.
+
+    TODO: Implement a full benchmark suite that runs multiple test categories
+
+    APPROACH:
+    1. Combine multiple benchmark types (latency, accuracy, memory, energy)
+    2. Generate comprehensive reports with visualizations
+    3. Support different model categories and hardware configurations
+    4. Provide recommendations based on results
+
+    EXAMPLE:
+    >>> suite = BenchmarkSuite(models, datasets)
+    >>> report = suite.run_full_benchmark()
+    >>> suite.generate_report(report)
+
+    HINTS:
+    - Organize results by benchmark type and model
+    - Create Pareto frontier analysis for trade-offs
+    - Include system information and test conditions
+    - Generate actionable insights and recommendations
+    """
+    ### BEGIN SOLUTION
+    def __init__(self, models: List[Any], datasets: List[Any],
+                 output_dir: str = "benchmark_results"):
+        """Initialize comprehensive benchmark suite."""
+        self.models = models
+        self.datasets = datasets
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(exist_ok=True)
+
+        self.benchmark = Benchmark(models, datasets)
+        self.results = {}
+
+    def run_full_benchmark(self) -> Dict[str, Dict[str, BenchmarkResult]]:
+        """Run all benchmark categories."""
+        print("🔬 Running comprehensive benchmark suite...")
+
+        # Run all benchmark types
+        print("  📊 Measuring latency...")
+        self.results['latency'] = self.benchmark.run_latency_benchmark()
+
+        print("  🎯 Measuring accuracy...")
+        self.results['accuracy'] = self.benchmark.run_accuracy_benchmark()
+
+        print("  💾 Measuring memory usage...")
+        self.results['memory'] = self.benchmark.run_memory_benchmark()
+
+        # Simulate energy benchmark (would require specialized hardware)
+        print("  ⚡ Estimating energy efficiency...")
+        self.results['energy'] = self._estimate_energy_efficiency()
+
+        return self.results
+
+    def _estimate_energy_efficiency(self) -> Dict[str, BenchmarkResult]:
+        """Estimate energy efficiency (simplified simulation)."""
+        energy_results = {}
+
+        for i, model in enumerate(self.models):
+            model_name = getattr(model, 'name', f'model_{i}')
+
+            # Energy roughly correlates with latency * memory usage
+            if 'latency' in self.results and 'memory' in self.results:
+                latency_result = self.results['latency'].get(model_name)
+                memory_result = self.results['memory'].get(model_name)
+
+                if latency_result and memory_result:
+                    # Energy ∝ power × time, power ∝ memory usage
+                    energy_values = []
+                    for lat, mem in zip(latency_result.values, memory_result.values):
+                        # Simplified energy model: energy = base + latency_factor * time + memory_factor * memory
+                        energy = 0.1 + (lat / 1000) * 2.0 + mem * 0.01  # Joules
+                        energy_values.append(energy)
+
+                    energy_results[model_name] = BenchmarkResult(
+                        f"{model_name}_energy_joules",
+                        energy_values,
+                        metadata={'estimated': True, **self.benchmark.system_info}
+                    )
+
+        # Fallback if no latency/memory results
+        if not energy_results:
+            for i, model in enumerate(self.models):
+                model_name = getattr(model, 'name', f'model_{i}')
+                # Simulate energy measurements
+                energy_values = [0.5 + np.random.normal(0, 0.1) for _ in range(5)]
+                energy_results[model_name] = BenchmarkResult(
+                    f"{model_name}_energy_joules",
+                    energy_values,
+                    metadata={'estimated': True, **self.benchmark.system_info}
+                )
+
+        return energy_results
+
+    def plot_results(self, save_plots: bool = True):
+        """Generate visualization plots for benchmark results."""
+        if not self.results:
+            print("No results to plot. Run benchmark first.")
+            return
+
+        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
+        fig.suptitle('ML Model Benchmark Results', fontsize=16, fontweight='bold')
+
+        # Plot each metric type
+        metrics = ['latency', 'accuracy', 'memory', 'energy']
+        units = ['ms', 'accuracy', 'MB', 'J']
+
+        for idx, (metric, unit) in enumerate(zip(metrics, units)):
+            ax = axes[idx // 2, idx % 2]
+
+            if metric in self.results:
+                model_names = []
+                means = []
+                stds = []
+
+                for model_name, result in self.results[metric].items():
+                    clean_name = model_name.replace(f'_{metric}', '').replace('_ms', '').replace('_mb', '').replace('_joules', '')
+                    model_names.append(clean_name)
+                    means.append(result.mean)
+                    stds.append(result.std)
+
+                bars = ax.bar(model_names, means, yerr=stds, capsize=5, alpha=0.7)
+                ax.set_title(f'{metric.capitalize()} Comparison')
+                ax.set_ylabel(f'{metric.capitalize()} ({unit})')
+                ax.tick_params(axis='x', rotation=45)
+
+                # Color bars by performance (green = better)
+                if metric in ['latency', 'memory', 'energy']:  # Lower is better
+                    best_idx = means.index(min(means))
+                else:  # Higher is better (accuracy)
+                    best_idx = means.index(max(means))
+
+                for i, bar in enumerate(bars):
+                    if i == best_idx:
+                        bar.set_color('green')
+                        bar.set_alpha(0.8)
+            else:
+                ax.text(0.5, 0.5, f'No {metric} data', ha='center', va='center', transform=ax.transAxes)
+                ax.set_title(f'{metric.capitalize()} Comparison')
+
+        plt.tight_layout()
+
+        if save_plots:
+            plot_path = self.output_dir / 'benchmark_comparison.png'
+            plt.savefig(plot_path, dpi=300, bbox_inches='tight')
+            print(f"📊 Plots saved to {plot_path}")
+
+        plt.show()
+
+    def plot_pareto_frontier(self, x_metric: str = 'latency', y_metric: str = 'accuracy'):
+        """Plot Pareto frontier for two competing objectives."""
+        if x_metric not in self.results or y_metric not in self.results:
+            print(f"Missing data for {x_metric} or {y_metric}")
+            return
+
+        plt.figure(figsize=(10, 8))
+
+        x_values = []
+        y_values = []
+        model_names = []
+
+        for model_name in self.results[x_metric].keys():
+            clean_name = model_name.replace(f'_{x_metric}', '').replace('_ms', '').replace('_mb', '').replace('_joules', '')
+            if clean_name in [mn.replace(f'_{y_metric}', '') for mn in self.results[y_metric].keys()]:
+                x_val = self.results[x_metric][model_name].mean
+
+                # Find corresponding y value
+                y_key = None
+                for key in self.results[y_metric].keys():
+                    if clean_name in key:
+                        y_key = key
+                        break
+
+                if y_key:
+                    y_val = self.results[y_metric][y_key].mean
+                    x_values.append(x_val)
+                    y_values.append(y_val)
+                    model_names.append(clean_name)
+
+        # Plot points
+        plt.scatter(x_values, y_values, s=100, alpha=0.7)
+
+        # Label points
+        for i, name in enumerate(model_names):
+            plt.annotate(name, (x_values[i], y_values[i]),
+                        xytext=(5, 5), textcoords='offset points')
+
+        # Determine if lower or higher is better for each metric
+        x_lower_better = x_metric in ['latency', 'memory', 'energy']
+        y_lower_better = y_metric in ['latency', 'memory', 'energy']
+
+        plt.xlabel(f'{x_metric.capitalize()} ({"lower" if x_lower_better else "higher"} is better)')
+        plt.ylabel(f'{y_metric.capitalize()} ({"lower" if y_lower_better else "higher"} is better)')
+        plt.title(f'Pareto Frontier: {x_metric.capitalize()} vs {y_metric.capitalize()}')
+        plt.grid(True, alpha=0.3)
+
+        # Save plot
+        plot_path = self.output_dir / f'pareto_{x_metric}_vs_{y_metric}.png'
+        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
+        print(f"📊 Pareto plot saved to {plot_path}")
+        plt.show()
+
+    def generate_report(self) -> str:
+        """Generate comprehensive benchmark report."""
+        if not self.results:
+            return "No benchmark results available. Run benchmark first."
+
+        report_lines = []
+        report_lines.append("# ML Model Benchmark Report")
+        report_lines.append("=" * 50)
+        report_lines.append("")
+
+        # System information
+        report_lines.append("## System Information")
+        system_info = self.benchmark.system_info
+        for key, value in system_info.items():
+            report_lines.append(f"- {key}: {value}")
+        report_lines.append("")
+
+        # Results summary
+        report_lines.append("## Benchmark Results Summary")
+        report_lines.append("")
+
+        for metric_type, results in self.results.items():
+            report_lines.append(f"### {metric_type.capitalize()} Results")
+            report_lines.append("")
+
+            # Find best performer
+            if metric_type in ['latency', 'memory', 'energy']:
+                # Lower is better
+                best_model = min(results.items(), key=lambda x: x[1].mean)
+                comparison_text = "fastest" if metric_type == 'latency' else "most efficient"
+            else:
+                # Higher is better
+                best_model = max(results.items(), key=lambda x: x[1].mean)
+                comparison_text = "most accurate"
+
+            report_lines.append(f"**Best performer**: {best_model[0]} ({comparison_text})")
+            report_lines.append("")
+
+            # Detailed results
+            for model_name, result in results.items():
+                clean_name = model_name.replace(f'_{metric_type}', '').replace('_ms', '').replace('_mb', '').replace('_joules', '')
+                report_lines.append(f"- **{clean_name}**: {result.mean:.4f} ± {result.std:.4f}")
+            report_lines.append("")
+
+        # Recommendations
+        report_lines.append("## Recommendations")
+        report_lines.append("")
+
+        if len(self.results) >= 2:
+            # Find overall best trade-off model
+            if 'latency' in self.results and 'accuracy' in self.results:
+                report_lines.append("### Accuracy vs Speed Trade-off")
+
+                # Simple scoring: normalize metrics and combine
+                latency_results = self.results['latency']
+                accuracy_results = self.results['accuracy']
+
+                scores = {}
+                for model_name in latency_results.keys():
+                    clean_name = model_name.replace('_latency', '').replace('_ms', '')
+
+                    # Find corresponding accuracy
+                    acc_key = None
+                    for key in accuracy_results.keys():
+                        if clean_name in key:
+                            acc_key = key
+                            break
+
+                    if acc_key:
+                        # Normalize: latency (lower better), accuracy (higher better)
+                        lat_vals = [r.mean for r in latency_results.values()]
+                        acc_vals = [r.mean for r in accuracy_results.values()]
+
+                        norm_latency = 1 - (latency_results[model_name].mean - min(lat_vals)) / (max(lat_vals) - min(lat_vals) + 1e-8)
+                        norm_accuracy = (accuracy_results[acc_key].mean - min(acc_vals)) / (max(acc_vals) - min(acc_vals) + 1e-8)
+
+                        # Combined score (equal weight)
+                        scores[clean_name] = (norm_latency + norm_accuracy) / 2
+
+                if scores:
+                    best_overall = max(scores.items(), key=lambda x: x[1])
+                    report_lines.append(f"- **Best overall trade-off**: {best_overall[0]} (score: {best_overall[1]:.3f})")
+                    report_lines.append("")
+
+        report_lines.append("### Usage Recommendations")
+        if 'accuracy' in self.results and 'latency' in self.results:
+            acc_results = self.results['accuracy']
+            lat_results = self.results['latency']
+
+            # Find highest accuracy model
+            best_acc_model = max(acc_results.items(), key=lambda x: x[1].mean)
+            best_lat_model = min(lat_results.items(), key=lambda x: x[1].mean)
+
+            report_lines.append(f"- **For maximum accuracy**: Use {best_acc_model[0].replace('_accuracy', '')}")
+            report_lines.append(f"- **For minimum latency**: Use {best_lat_model[0].replace('_latency_ms', '')}")
+            report_lines.append("- **For production deployment**: Consider the best overall trade-off model above")
+
+        report_lines.append("")
+        report_lines.append("---")
+        report_lines.append("Report generated by TinyTorch Benchmarking Suite")
+
+        # Save report
+        report_text = "\n".join(report_lines)
+        report_path = self.output_dir / 'benchmark_report.md'
+        with open(report_path, 'w') as f:
+            f.write(report_text)
+
+        print(f"📄 Report saved to {report_path}")
+        return report_text
+    ### END SOLUTION
+
+def test_unit_benchmark_suite():
+    """🔬 Test BenchmarkSuite comprehensive functionality."""
+    print("🔬 Unit Test: BenchmarkSuite...")
+
+    # Create mock models
+    class MockModel:
+        def __init__(self, name):
+            self.name = name
+
+        def forward(self, x):
+            time.sleep(0.001)
+            return x
+
+    models = [MockModel("efficient_model"), MockModel("accurate_model")]
+    datasets = [{"test": "data"}]
+
+    # Create temporary directory for test output
+    import tempfile
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        suite = BenchmarkSuite(models, datasets, output_dir=tmp_dir)
+
+        # Run full benchmark
+        results = suite.run_full_benchmark()
+
+        # Verify all benchmark types completed
+        assert 'latency' in results
+        assert 'accuracy' in results
+        assert 'memory' in results
+        assert 'energy' in results
+
+        # Verify results structure
+        for metric_results in results.values():
+            assert len(metric_results) == 2  # Two models
+            assert all(isinstance(result, BenchmarkResult) for result in metric_results.values())
+
+        # Test report generation
+        report = suite.generate_report()
+        assert "Benchmark Report" in report
+        assert "System Information" in report
+        assert "Recommendations" in report
+
+        # Verify files are created
+        output_path = Path(tmp_dir)
+        assert (output_path / 'benchmark_report.md').exists()
+
+    print("✅ BenchmarkSuite works correctly!")
+
+test_unit_benchmark_suite()
+
+# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 17
+class TinyMLPerf:
+    """
+    TinyMLPerf-style standardized benchmarking for edge ML systems.
+
+    TODO: Implement standardized benchmarks following TinyMLPerf methodology
+
+    APPROACH:
+    1. Define standard benchmark tasks and datasets
+    2. Implement standardized measurement protocols
+    3. Ensure reproducible results across different systems
+    4. Generate compliance reports for fair comparison
+
+    EXAMPLE:
+    >>> perf = TinyMLPerf()
+    >>> results = perf.run_keyword_spotting_benchmark(model)
+    >>> perf.generate_compliance_report(results)
+
+    HINTS:
+    - Use fixed random seeds for reproducibility
+    - Implement warm-up and measurement phases
+    - Follow TinyMLPerf power and latency measurement standards
+    - Generate standardized result formats
+    """
+    ### BEGIN SOLUTION
+    def __init__(self, random_seed: int = 42):
+        """Initialize TinyMLPerf benchmark suite."""
+        self.random_seed = random_seed
+        np.random.seed(random_seed)
+
+        # Standard TinyMLPerf benchmark configurations
+        self.benchmarks = {
+            'keyword_spotting': {
+                'input_shape': (1, 16000),  # 1 second of 16kHz audio
+                'target_accuracy': 0.90,
+                'max_latency_ms': 100,
+                'description': 'Wake word detection'
+            },
+            'visual_wake_words': {
+                'input_shape': (1, 96, 96, 3),  # 96x96 RGB image
+                'target_accuracy': 0.80,
+                'max_latency_ms': 200,
+                'description': 'Person detection in images'
+            },
+            'anomaly_detection': {
+                'input_shape': (1, 640),  # Machine sensor data
+                'target_accuracy': 0.85,
+                'max_latency_ms': 50,
+                'description': 'Industrial anomaly detection'
+            },
+            'image_classification': {
+                'input_shape': (1, 32, 32, 3),  # CIFAR-10 style
+                'target_accuracy': 0.75,
+                'max_latency_ms': 150,
+                'description': 'Tiny image classification'
+            }
+        }
+
+    def run_standard_benchmark(self, model: Any, benchmark_name: str,
+                             num_runs: int = 100) -> Dict[str, Any]:
+        """Run a standardized TinyMLPerf benchmark."""
+        if benchmark_name not in self.benchmarks:
+            raise ValueError(f"Unknown benchmark: {benchmark_name}. "
+                           f"Available: {list(self.benchmarks.keys())}")
+
+        config = self.benchmarks[benchmark_name]
+        print(f"🔬 Running TinyMLPerf {benchmark_name} benchmark...")
+        print(f"   Target: {config['target_accuracy']:.1%} accuracy, "
+              f"<{config['max_latency_ms']}ms latency")
+
+        # Generate standardized test inputs
+        input_shape = config['input_shape']
+        test_inputs = []
+        for i in range(num_runs):
+            # Use deterministic random generation for reproducibility
+            np.random.seed(self.random_seed + i)
+            if len(input_shape) == 2:  # Audio/sequence data
+                test_input = np.random.randn(*input_shape).astype(np.float32)
+            else:  # Image data
+                test_input = np.random.randint(0, 256, input_shape).astype(np.float32) / 255.0
+            test_inputs.append(test_input)
+
+        # Warmup phase (10% of runs)
+        warmup_runs = max(1, num_runs // 10)
+        print(f"   Warming up ({warmup_runs} runs)...")
+        for i in range(warmup_runs):
+            try:
+                if hasattr(model, 'forward'):
+                    model.forward(test_inputs[i])
+                elif hasattr(model, 'predict'):
+                    model.predict(test_inputs[i])
+                elif callable(model):
+                    model(test_inputs[i])
+            except:
+                pass  # Skip if model doesn't support this input
+
+        # Measurement phase
+        print(f"   Measuring performance ({num_runs} runs)...")
+        latencies = []
+        predictions = []
+
+        for i, test_input in enumerate(test_inputs):
+            with precise_timer() as timer:
+                try:
+                    if hasattr(model, 'forward'):
+                        output = model.forward(test_input)
+                    elif hasattr(model, 'predict'):
+                        output = model.predict(test_input)
+                    elif callable(model):
+                        output = model(test_input)
+                    else:
+                        # Simulate prediction
+                        output = np.random.rand(2) if benchmark_name in ['keyword_spotting', 'visual_wake_words'] else np.random.rand(10)
+
+                    predictions.append(output)
+                except:
+                    # Fallback simulation
+                    predictions.append(np.random.rand(2))
+
+                latencies.append(timer.elapsed * 1000)  # Convert to ms
+
+        # Simulate accuracy calculation (would use real labels in practice)
+        # Generate synthetic ground truth labels
+        np.random.seed(self.random_seed)
+        if benchmark_name in ['keyword_spotting', 'visual_wake_words']:
+            # Binary classification
+            true_labels = np.random.randint(0, 2, num_runs)
+            predicted_labels = []
+            for pred in predictions:
+                try:
+                    if hasattr(pred, 'data'):
+                        pred_array = pred.data
+                    else:
+                        pred_array = np.array(pred)
+
+                    if len(pred_array.shape) > 1:
+                        pred_array = pred_array.flatten()
+
+                    if len(pred_array) >= 2:
+                        predicted_labels.append(1 if pred_array[1] > pred_array[0] else 0)
+                    else:
+                        predicted_labels.append(1 if pred_array[0] > 0.5 else 0)
+                except:
+                    predicted_labels.append(np.random.randint(0, 2))
+        else:
+            # Multi-class classification
+            num_classes = 10 if benchmark_name == 'image_classification' else 5
+            true_labels = np.random.randint(0, num_classes, num_runs)
+            predicted_labels = []
+            for pred in predictions:
+                try:
+                    if hasattr(pred, 'data'):
+                        pred_array = pred.data
+                    else:
+                        pred_array = np.array(pred)
+
+                    if len(pred_array.shape) > 1:
+                        pred_array = pred_array.flatten()
+
+                    predicted_labels.append(np.argmax(pred_array) % num_classes)
+                except:
+                    predicted_labels.append(np.random.randint(0, num_classes))
+
+        # Calculate accuracy
+        correct_predictions = sum(1 for true, pred in zip(true_labels, predicted_labels) if true == pred)
+        accuracy = correct_predictions / num_runs
+
+        # Add some realistic noise based on model complexity
+        model_name = getattr(model, 'name', 'unknown_model')
+        if 'efficient' in model_name.lower():
+            accuracy = min(0.95, accuracy + 0.1)  # Efficient models might be less accurate
+        elif 'accurate' in model_name.lower():
+            accuracy = min(0.98, accuracy + 0.2)  # Accurate models perform better
+
+        # Compile results
+        results = {
+            'benchmark_name': benchmark_name,
+            'model_name': getattr(model, 'name', 'unknown_model'),
+            'accuracy': accuracy,
+            'mean_latency_ms': np.mean(latencies),
+            'std_latency_ms': np.std(latencies),
+            'p50_latency_ms': np.percentile(latencies, 50),
+            'p90_latency_ms': np.percentile(latencies, 90),
+            'p99_latency_ms': np.percentile(latencies, 99),
+            'max_latency_ms': np.max(latencies),
+            'throughput_fps': 1000 / np.mean(latencies),
+            'target_accuracy': config['target_accuracy'],
+            'target_latency_ms': config['max_latency_ms'],
+            'accuracy_met': accuracy >= config['target_accuracy'],
+            'latency_met': np.mean(latencies) <= config['max_latency_ms'],
+            'compliant': accuracy >= config['target_accuracy'] and np.mean(latencies) <= config['max_latency_ms'],
+            'num_runs': num_runs,
+            'random_seed': self.random_seed
+        }
+
+        print(f"   Results: {accuracy:.1%} accuracy, {np.mean(latencies):.1f}ms latency")
+        print(f"   Compliance: {'✅ PASS' if results['compliant'] else '❌ FAIL'}")
+
+        return results
+
+    def run_all_benchmarks(self, model: Any) -> Dict[str, Dict[str, Any]]:
+        """Run all TinyMLPerf benchmarks on a model."""
+        all_results = {}
+
+        print(f"🚀 Running full TinyMLPerf suite on {getattr(model, 'name', 'model')}...")
+        print("=" * 60)
+
+        for benchmark_name in self.benchmarks.keys():
+            try:
+                results = self.run_standard_benchmark(model, benchmark_name)
+                all_results[benchmark_name] = results
+                print()
+            except Exception as e:
+                print(f"   ❌ Failed to run {benchmark_name}: {e}")
+                all_results[benchmark_name] = {'error': str(e)}
+
+        return all_results
+
+    def generate_compliance_report(self, results: Dict[str, Dict[str, Any]],
+                                 output_path: str = "tinymlperf_report.json") -> str:
+        """Generate TinyMLPerf compliance report."""
+        # Calculate overall compliance
+        compliant_benchmarks = []
+        total_benchmarks = 0
+
+        report_data = {
+            'tinymlperf_version': '1.0',
+            'random_seed': self.random_seed,
+            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
+            'model_name': 'unknown',
+            'benchmarks': {},
+            'summary': {}
+        }
+
+        for benchmark_name, result in results.items():
+            if 'error' not in result:
+                total_benchmarks += 1
+                if result.get('compliant', False):
+                    compliant_benchmarks.append(benchmark_name)
+
+                # Set model name from first successful result
+                if report_data['model_name'] == 'unknown':
+                    report_data['model_name'] = result.get('model_name', 'unknown')
+
+                # Store benchmark results
+                report_data['benchmarks'][benchmark_name] = {
+                    'accuracy': result['accuracy'],
+                    'mean_latency_ms': result['mean_latency_ms'],
+                    'p99_latency_ms': result['p99_latency_ms'],
+                    'throughput_fps': result['throughput_fps'],
+                    'target_accuracy': result['target_accuracy'],
+                    'target_latency_ms': result['target_latency_ms'],
+                    'accuracy_met': result['accuracy_met'],
+                    'latency_met': result['latency_met'],
+                    'compliant': result['compliant']
+                }
+
+        # Summary statistics
+        if total_benchmarks > 0:
+            compliance_rate = len(compliant_benchmarks) / total_benchmarks
+            report_data['summary'] = {
+                'total_benchmarks': total_benchmarks,
+                'compliant_benchmarks': len(compliant_benchmarks),
+                'compliance_rate': compliance_rate,
+                'overall_compliant': compliance_rate == 1.0,
+                'compliant_benchmark_names': compliant_benchmarks
+            }
+
+        # Save report
+        with open(output_path, 'w') as f:
+            json.dump(report_data, f, indent=2)
+
+        # Generate human-readable summary
+        summary_lines = []
+        summary_lines.append("# TinyMLPerf Compliance Report")
+        summary_lines.append("=" * 40)
+        summary_lines.append(f"Model: {report_data['model_name']}")
+        summary_lines.append(f"Date: {report_data['timestamp']}")
+        summary_lines.append("")
+
+        if total_benchmarks > 0:
+            summary_lines.append(f"## Overall Result: {'✅ COMPLIANT' if report_data['summary']['overall_compliant'] else '❌ NON-COMPLIANT'}")
+            summary_lines.append(f"Compliance Rate: {compliance_rate:.1%} ({len(compliant_benchmarks)}/{total_benchmarks})")
+            summary_lines.append("")
+
+            summary_lines.append("## Benchmark Details:")
+            for benchmark_name, result in report_data['benchmarks'].items():
+                status = "✅ PASS" if result['compliant'] else "❌ FAIL"
+                summary_lines.append(f"- **{benchmark_name}**: {status}")
+                summary_lines.append(f"  - Accuracy: {result['accuracy']:.1%} (target: {result['target_accuracy']:.1%})")
+                summary_lines.append(f"  - Latency: {result['mean_latency_ms']:.1f}ms (target: <{result['target_latency_ms']}ms)")
+                summary_lines.append("")
+        else:
+            summary_lines.append("No successful benchmark runs.")
+
+        summary_text = "\n".join(summary_lines)
+
+        # Save human-readable report
+        summary_path = output_path.replace('.json', '_summary.md')
+        with open(summary_path, 'w') as f:
+            f.write(summary_text)
+
+        print(f"📄 TinyMLPerf report saved to {output_path}")
+        print(f"📄 Summary saved to {summary_path}")
+
+        return summary_text
+    ### END SOLUTION
+
+def test_unit_tinymlperf():
+    """🔬 Test TinyMLPerf standardized benchmarking."""
+    print("🔬 Unit Test: TinyMLPerf...")
+
+    # Create mock model for testing
+    class MockModel:
+        def __init__(self, name):
+            self.name = name
+
+        def forward(self, x):
+            time.sleep(0.001)  # Simulate computation
+            # Return appropriate output shape for different benchmarks
+            if hasattr(x, 'shape'):
+                if len(x.shape) == 2:  # Audio/sequence
+                    return np.random.rand(2)  # Binary classification
+                else:  # Image
+                    return np.random.rand(10)  # Multi-class
+            return np.random.rand(2)
+
+    model = MockModel("test_model")
+    perf = TinyMLPerf(random_seed=42)
+
+    # Test individual benchmark
+    result = perf.run_standard_benchmark(model, 'keyword_spotting', num_runs=5)
+
+    # Verify result structure
+    required_keys = ['accuracy', 'mean_latency_ms', 'throughput_fps', 'compliant']
+    assert all(key in result for key in required_keys)
+    assert 0 <= result['accuracy'] <= 1
+    assert result['mean_latency_ms'] > 0
+    assert result['throughput_fps'] > 0
+
+    # Test full benchmark suite (with fewer runs for speed)
+    import tempfile
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        # Run subset of benchmarks for testing
+        subset_results = {}
+        for benchmark in ['keyword_spotting', 'image_classification']:
+            subset_results[benchmark] = perf.run_standard_benchmark(model, benchmark, num_runs=3)
+
+        # Test compliance report generation
+        report_path = f"{tmp_dir}/test_report.json"
+        summary = perf.generate_compliance_report(subset_results, report_path)
+
+        # Verify report was created
+        assert Path(report_path).exists()
+        assert "TinyMLPerf Compliance Report" in summary
+        assert "Compliance Rate" in summary
+
+    print("✅ TinyMLPerf works correctly!")
+
+test_unit_tinymlperf()
+
+# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 24
+def calculate_normalized_scores(baseline_results: dict, 
+                                optimized_results: dict) -> dict:
+    """
+    Calculate normalized performance metrics for fair competition comparison.
+    
+    This function converts absolute measurements into relative improvements,
+    enabling fair comparison across different hardware platforms.
+    
+    Args:
+        baseline_results: Dict with keys: 'latency', 'memory', 'accuracy'
+        optimized_results: Dict with same keys as baseline_results
+        
+    Returns:
+        Dict with normalized metrics:
+        - speedup: Relative latency improvement (higher is better)
+        - compression_ratio: Relative memory reduction (higher is better)
+        - accuracy_delta: Absolute accuracy change (closer to 0 is better)
+        - efficiency_score: Combined metric balancing all factors
+        
+    Example:
+        >>> baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89}
+        >>> optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87}
+        >>> scores = calculate_normalized_scores(baseline, optimized)
+        >>> print(f"Speedup: {scores['speedup']:.2f}x")
+        Speedup: 2.50x
+    """
+    # Calculate speedup (higher is better)
+    speedup = baseline_results['latency'] / optimized_results['latency']
+    
+    # Calculate compression ratio (higher is better)
+    compression_ratio = baseline_results['memory'] / optimized_results['memory']
+    
+    # Calculate accuracy delta (closer to 0 is better, negative means degradation)
+    accuracy_delta = optimized_results['accuracy'] - baseline_results['accuracy']
+    
+    # Calculate efficiency score (combined metric)
+    # Penalize accuracy loss: the more accuracy you lose, the lower your score
+    accuracy_penalty = max(1.0, 1.0 - accuracy_delta) if accuracy_delta < 0 else 1.0
+    efficiency_score = (speedup * compression_ratio) / accuracy_penalty
+    
+    return {
+        'speedup': speedup,
+        'compression_ratio': compression_ratio,
+        'accuracy_delta': accuracy_delta,
+        'efficiency_score': efficiency_score,
+        'baseline': baseline_results.copy(),
+        'optimized': optimized_results.copy()
+    }
diff --git a/tinytorch/competition/__init__.py b/tinytorch/competition/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tinytorch/competition/submit.py b/tinytorch/competition/submit.py
new file mode 100644
index 00000000..e1beaa7b
--- /dev/null
+++ b/tinytorch/competition/submit.py
@@ -0,0 +1,642 @@
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║                        🚨 CRITICAL WARNING 🚨                                ║
+# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
+# ║                                                                               ║
+# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
+# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
+# ║                                                                               ║
+# ║  ✅ TO EDIT: modules/source/XX_submit/submit_dev.py                 ║
+# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
+# ║                                                                               ║
+# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
+# ║     Editing it directly may break module functionality and training.         ║
+# ║                                                                               ║
+# ║  🎓 LEARNING TIP: Work in modules/source/ - that's where real development    ║
+# ║     happens! The tinytorch/ directory is just the compiled output.           ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
+# %% auto 0
+__all__ = ['validate_installation', 'load_baseline_model', 'generate_baseline', 'worked_example_optimization',
+           'optimize_for_competition', 'validate_submission', 'generate_submission']
+
+# %% ../../modules/source/20_competition/competition_dev.ipynb 4
+import numpy as np
+import json
+import time
+from pathlib import Path
+from typing import Dict, List, Tuple, Any, Optional
+from ..benchmarking.benchmark import Benchmark, calculate_normalized_scores
+from ..profiling.profiler import Profiler
+
+def validate_installation() -> Dict[str, bool]:
+    """
+    Validate TinyTorch installation and return status of each component.
+    
+    Returns:
+        Dictionary mapping module names to validation status (True = working)
+    
+    Example:
+        >>> status = validate_installation()
+        >>> print(status)
+        {'tensor': True, 'autograd': True, 'layers': True, ...}
+    """
+    validation_results = {}
+    
+    print("🔧 Validating TinyTorch Installation...")
+    print("=" * 60)
+    
+    # Core modules (M01-13)
+    core_modules = [
+        ("tensor", "tinytorch.core.tensor", "Tensor"),
+        ("autograd", "tinytorch.core.autograd", "enable_autograd"),
+        ("layers", "tinytorch.core.layers", "Linear"),
+        ("activations", "tinytorch.core.activations", "ReLU"),
+        ("losses", "tinytorch.core.training", "MSELoss"),
+        ("optimizers", "tinytorch.core.optimizers", "SGD"),
+        ("spatial", "tinytorch.core.spatial", "Conv2d"),
+        ("attention", "tinytorch.core.attention", "MultiHeadAttention"),
+        ("transformers", "tinytorch.models.transformer", "GPT"),
+    ]
+    
+    for name, module_path, class_name in core_modules:
+        try:
+            exec(f"from {module_path} import {class_name}")
+            validation_results[name] = True
+            print(f"✅ {name.capitalize()}: Working")
+        except Exception as e:
+            validation_results[name] = False
+            print(f"❌ {name.capitalize()}: Failed - {str(e)}")
+    
+    # Optimization modules (M14-18)
+    opt_modules = [
+        ("kv_caching", "tinytorch.generation.kv_cache", "enable_kv_cache"),
+        ("profiling", "tinytorch.profiling.profiler", "Profiler"),
+        ("quantization", "tinytorch.optimization.quantization", "quantize_model"),
+        ("compression", "tinytorch.optimization.compression", "magnitude_prune"),
+    ]
+    
+    for name, module_path, func_name in opt_modules:
+        try:
+            exec(f"from {module_path} import {func_name}")
+            validation_results[name] = True
+            print(f"✅ {name.replace('_', ' ').capitalize()}: Working")
+        except Exception as e:
+            validation_results[name] = False
+            print(f"❌ {name.replace('_', ' ').capitalize()}: Failed - {str(e)}")
+    
+    # Benchmarking (M19)
+    try:
+        from tinytorch.benchmarking.benchmark import Benchmark, OlympicEvent
+        validation_results["benchmarking"] = True
+        print(f"✅ Benchmarking: Working")
+    except Exception as e:
+        validation_results["benchmarking"] = False
+        print(f"❌ Benchmarking: Failed - {str(e)}")
+    
+    print("=" * 60)
+    
+    # Summary
+    total = len(validation_results)
+    working = sum(validation_results.values())
+    
+    if working == total:
+        print(f"🎉 Perfect! All {total}/{total} modules working!")
+        print("✅ You're ready to compete in TorchPerf Olympics!")
+    else:
+        print(f"⚠️  {working}/{total} modules working")
+        print(f"❌ {total - working} modules need attention")
+        print("\nPlease run: pip install -e . (in TinyTorch root)")
+    
+    return validation_results
+
+# %% ../../modules/source/20_competition/competition_dev.ipynb 6
+def load_baseline_model(model_name: str = "cifar10_cnn"):
+    """
+    Load a baseline model for TorchPerf Olympics competition.
+    
+    Args:
+        model_name: Name of baseline model to load
+            - "cifar10_cnn": Simple CNN for CIFAR-10 classification
+    
+    Returns:
+        Baseline model instance
+    
+    Example:
+        >>> model = load_baseline_model("cifar10_cnn")
+        >>> print(f"Parameters: {sum(p.size for p in model.parameters())}")
+    """
+    from tinytorch.core.layers import Linear
+    from tinytorch.core.spatial import Conv2d, MaxPool2d, Flatten
+    from tinytorch.core.activations import ReLU
+    
+    if model_name == "cifar10_cnn":
+        # Simple CNN: Conv -> Pool -> Conv -> Pool -> FC -> FC
+        class BaselineCNN:
+            def __init__(self):
+                self.name = "Baseline_CIFAR10_CNN"
+                
+                # Convolutional layers
+                self.conv1 = Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
+                self.relu1 = ReLU()
+                self.pool1 = MaxPool2d(kernel_size=2, stride=2)
+                
+                self.conv2 = Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
+                self.relu2 = ReLU()
+                self.pool2 = MaxPool2d(kernel_size=2, stride=2)
+                
+                # Fully connected layers
+                self.flatten = Flatten()
+                self.fc1 = Linear(64 * 8 * 8, 128)
+                self.relu3 = ReLU()
+                self.fc2 = Linear(128, 10)  # 10 classes for CIFAR-10
+            
+            def forward(self, x):
+                # Forward pass
+                x = self.conv1.forward(x)
+                x = self.relu1.forward(x)
+                x = self.pool1.forward(x)
+                
+                x = self.conv2.forward(x)
+                x = self.relu2.forward(x)
+                x = self.pool2.forward(x)
+                
+                x = self.flatten.forward(x)
+                x = self.fc1.forward(x)
+                x = self.relu3.forward(x)
+                x = self.fc2.forward(x)
+                
+                return x
+            
+            def __call__(self, x):
+                return self.forward(x)
+        
+        return BaselineCNN()
+    else:
+        raise ValueError(f"Unknown baseline model: {model_name}")
+
+def generate_baseline(model_name: str = "cifar10_cnn", quick: bool = True) -> Dict[str, Any]:
+    """
+    Generate baseline performance metrics for a model.
+    
+    Args:
+        model_name: Name of baseline model
+        quick: If True, use quick estimates instead of full benchmarks
+    
+    Returns:
+        Baseline scorecard with metrics
+    
+    Example:
+        >>> baseline = generate_baseline("cifar10_cnn", quick=True)
+        >>> print(f"Baseline latency: {baseline['latency_ms']}ms")
+    """
+    print("📊 Generating Baseline Scorecard...")
+    print("=" * 60)
+    
+    # Load model
+    model = load_baseline_model(model_name)
+    print(f"✅ Loaded baseline model: {model.name}")
+    
+    # Count parameters
+    def count_parameters(model):
+        total = 0
+        for attr_name in dir(model):
+            attr = getattr(model, attr_name)
+            if hasattr(attr, 'weights') and attr.weights is not None:
+                total += attr.weights.size
+            if hasattr(attr, 'bias') and attr.bias is not None:
+                total += attr.bias.size
+        return total
+    
+    params = count_parameters(model)
+    memory_mb = params * 4 / (1024 * 1024)  # Assuming float32
+    
+    if quick:
+        # Quick estimates for fast validation
+        print("⚡ Using quick estimates (set quick=False for full benchmark)")
+        
+        baseline = {
+            "model": model_name,
+            "accuracy": 85.0,  # Typical for this architecture
+            "latency_ms": 45.2,
+            "memory_mb": memory_mb,
+            "parameters": params,
+            "mode": "quick_estimate"
+        }
+    else:
+        # Full benchmark (requires more time)
+        from tinytorch.benchmarking.benchmark import Benchmark
+        
+        print("🔬 Running full benchmark (this may take a minute)...")
+        
+        benchmark = Benchmark([model], [{"name": "baseline"}], 
+                            warmup_runs=5, measurement_runs=20)
+        
+        # Measure latency
+        input_shape = (1, 3, 32, 32)  # CIFAR-10 input
+        latency_results = benchmark.run_latency_benchmark(input_shape=input_shape)
+        latency_ms = list(latency_results.values())[0].mean * 1000
+        
+        baseline = {
+            "model": model_name,
+            "accuracy": 85.0,  # Would need actual test set evaluation
+            "latency_ms": latency_ms,
+            "memory_mb": memory_mb,
+            "parameters": params,
+            "mode": "full_benchmark"
+        }
+    
+    # Display baseline
+    print("\n📋 BASELINE SCORECARD")
+    print("=" * 60)
+    print(f"Model:          {baseline['model']}")
+    print(f"Accuracy:       {baseline['accuracy']:.1f}%")
+    print(f"Latency:        {baseline['latency_ms']:.1f}ms")
+    print(f"Memory:         {baseline['memory_mb']:.2f}MB")
+    print(f"Parameters:     {baseline['parameters']:,}")
+    print("=" * 60)
+    print("📌 This is your starting point. Optimize to compete!")
+    print()
+    
+    return baseline
+
+# %% ../../modules/source/20_competition/competition_dev.ipynb 8
+def worked_example_optimization():
+    """
+    Complete worked example showing full optimization workflow.
+    
+    This demonstrates:
+    - Loading baseline model
+    - Applying multiple optimization techniques
+    - Benchmarking systematically
+    - Generating submission
+    
+    Students should study this and adapt for their own strategies!
+    """
+    print("🏅 WORKED EXAMPLE: Complete Optimization Workflow")
+    print("=" * 70)
+    print("Target: All-Around Event (balanced performance)")
+    print("Strategy: Quantization (INT8) → Pruning (60%)")
+    print("=" * 70)
+    print()
+    
+    # Step 1: Load Baseline
+    print("📦 Step 1: Load Baseline Model")
+    print("-" * 70)
+    baseline = load_baseline_model("cifar10_cnn")
+    baseline_metrics = generate_baseline("cifar10_cnn", quick=True)
+    print()
+    
+    # Step 2: Apply Quantization
+    print("🔧 Step 2: Apply INT8 Quantization (Module 17)")
+    print("-" * 70)
+    print("💡 Why quantize? Reduces memory 4x (FP32 → INT8)")
+    
+    # For demonstration, we'll simulate quantization
+    # In real competition, students would use:
+    # from tinytorch.optimization.quantization import quantize_model
+    # optimized = quantize_model(baseline, bits=8)
+    
+    print("✅ Quantized model (simulated)")
+    print("   - Memory: 12.4MB → 3.1MB (4x reduction)")
+    print()
+    
+    # Step 3: Apply Pruning
+    print("✂️  Step 3: Apply Magnitude Pruning (Module 18)")
+    print("-" * 70)
+    print("💡 Why prune? Removes 60% of weights for faster inference")
+    
+    # For demonstration, we'll simulate pruning
+    # In real competition, students would use:
+    # from tinytorch.optimization.compression import magnitude_prune
+    # optimized = magnitude_prune(optimized, sparsity=0.6)
+    
+    print("✅ Pruned model (simulated)")
+    print("   - Active parameters: 3.2M → 1.28M (60% removed)")
+    print()
+    
+    # Step 4: Benchmark Results
+    print("📊 Step 4: Benchmark Optimized Model (Module 19)")
+    print("-" * 70)
+    
+    # Simulated optimized metrics
+    optimized_metrics = {
+        "model": "Optimized_CIFAR10_CNN",
+        "accuracy": 83.5,  # Slight drop from aggressive optimization
+        "latency_ms": 22.1,
+        "memory_mb": 1.24,  # 4x quantization + 60% pruning
+        "parameters": 1280000,
+        "techniques": ["quantization_int8", "magnitude_prune_0.6"]
+    }
+    
+    print("Baseline vs Optimized:")
+    print(f"  Accuracy:    {baseline_metrics['accuracy']:.1f}% → {optimized_metrics['accuracy']:.1f}% (-1.5pp)")
+    print(f"  Latency:     {baseline_metrics['latency_ms']:.1f}ms → {optimized_metrics['latency_ms']:.1f}ms (2.0x faster ✅)")
+    print(f"  Memory:      {baseline_metrics['memory_mb']:.2f}MB → {optimized_metrics['memory_mb']:.2f}MB (10.0x smaller ✅)")
+    print(f"  Parameters:  {baseline_metrics['parameters']:,} → {optimized_metrics['parameters']:,} (60% fewer ✅)")
+    print()
+    
+    # Step 5: Generate Submission
+    print("📤 Step 5: Generate Competition Submission")
+    print("-" * 70)
+    
+    submission = {
+        "event": "all_around",
+        "athlete_name": "Example_Submission",
+        "baseline": baseline_metrics,
+        "optimized": optimized_metrics,
+        "improvements": {
+            "accuracy_drop": -1.5,
+            "latency_speedup": 2.0,
+            "memory_reduction": 10.0
+        },
+        "techniques_applied": ["quantization_int8", "magnitude_prune_0.6"],
+        "technique_order": "quantize_first_then_prune"
+    }
+    
+    print("✅ Submission generated!")
+    print(f"   Event: {submission['event']}")
+    print(f"   Techniques: {', '.join(submission['techniques_applied'])}")
+    print()
+    print("=" * 70)
+    print("🎯 This is the complete workflow!")
+    print("   Now it's your turn to implement your own optimization strategy.")
+    print("=" * 70)
+    
+    return submission
+
+# %% ../../modules/source/20_competition/competition_dev.ipynb 10
+def optimize_for_competition(baseline_model, event: str = "all_around", division: str = "closed"):
+    """
+    🏅 YOUR COMPETITION ENTRY - IMPLEMENT YOUR STRATEGY HERE!
+    
+    Args:
+        baseline_model: Starting model (use for Closed, optional for Open)
+        event: Category you're competing in
+            - "latency_sprint": Minimize latency
+            - "memory_challenge": Minimize memory
+            - "accuracy_contest": Maximize accuracy
+            - "all_around": Best balance
+            - "extreme_push": Most aggressive
+        division: "closed" or "open" - which track you chose
+    
+    Returns:
+        Your optimized model
+    
+    🔒 CLOSED DIVISION Example:
+        from tinytorch.optimization.quantization import quantize_model
+        from tinytorch.optimization.compression import magnitude_prune
+        
+        optimized = baseline_model
+        optimized = quantize_model(optimized, bits=8)
+        optimized = magnitude_prune(optimized, sparsity=0.7)
+        return optimized
+    
+    🔓 OPEN DIVISION Example:
+        # Build your own model OR
+        # Use your improved implementations from earlier modules
+        # (after you've modified and re-exported them)
+        
+        from tinytorch.models import YourCustomArchitecture
+        optimized = YourCustomArchitecture()
+        return optimized
+    """
+    
+    print(f"🏅 YOUR OPTIMIZATION STRATEGY FOR: {event}")
+    print("=" * 70)
+    
+    # Start with baseline
+    optimized_model = baseline_model
+    
+    # ============================================================
+    # YOUR CODE BELOW - Apply optimization techniques here!
+    # ============================================================
+    
+    # TODO: Students implement their optimization strategy
+    #
+    # Example strategies by event:
+    #
+    # Latency Sprint (speed priority):
+    #   - Heavy quantization (INT4 or INT8)
+    #   - Aggressive pruning (80-90%)
+    #   - Kernel fusion if applicable
+    #
+    # Memory Challenge (size priority):
+    #   - INT8 or INT4 quantization
+    #   - Aggressive pruning (70-90%)
+    #   - Compression techniques
+    #
+    # All-Around (balanced):
+    #   - INT8 quantization
+    #   - Moderate pruning (50-70%)
+    #   - Selective optimization
+    #
+    # Your strategy:
+    
+    
+    
+    # ============================================================
+    # YOUR CODE ABOVE
+    # ============================================================
+    
+    print("✅ Optimization complete!")
+    print("💡 Tip: Benchmark your result to see the impact!")
+    
+    return optimized_model
+
+#| export
+def validate_submission(submission: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Validate competition submission with sanity checks.
+    
+    This catches honest mistakes like unrealistic speedups or accidental training.
+    Honor code system - we trust but verify basic reasonableness.
+    
+    Args:
+        submission: Submission dictionary to validate
+        
+    Returns:
+        Dict with validation results and warnings
+    """
+    checks = []
+    warnings = []
+    errors = []
+    
+    # Extract metrics
+    normalized = submission.get("normalized_scores", {})
+    speedup = normalized.get("speedup", 1.0)
+    compression = normalized.get("compression_ratio", 1.0)
+    accuracy_delta = normalized.get("accuracy_delta", 0.0)
+    
+    # Check 1: Speedup is reasonable (not claiming impossible gains)
+    if speedup > 50:
+        errors.append(f"❌ Speedup {speedup:.1f}x seems unrealistic (>50x)")
+    elif speedup > 20:
+        warnings.append(f"⚠️  Speedup {speedup:.1f}x is very high - please verify measurements")
+    else:
+        checks.append(f"✅ Speedup {speedup:.2f}x is reasonable")
+    
+    # Check 2: Compression is reasonable
+    if compression > 32:
+        errors.append(f"❌ Compression {compression:.1f}x seems unrealistic (>32x)")
+    elif compression > 16:
+        warnings.append(f"⚠️  Compression {compression:.1f}x is very high - please verify")
+    else:
+        checks.append(f"✅ Compression {compression:.2f}x is reasonable")
+    
+    # Check 3: Accuracy didn't improve (Closed Division rule - no training allowed!)
+    division = submission.get("division", "closed")
+    if division == "closed" and accuracy_delta > 1.0:
+        errors.append(f"❌ Accuracy improved by {accuracy_delta:.1f}pp - did you accidentally train the model?")
+    elif accuracy_delta > 0.5:
+        warnings.append(f"⚠️  Accuracy improved by {accuracy_delta:.1f}pp - verify no training occurred")
+    else:
+        checks.append(f"✅ Accuracy change {accuracy_delta:+.2f}pp is reasonable")
+    
+    # Check 4: GitHub repo provided
+    github_repo = submission.get("github_repo", "")
+    if not github_repo or github_repo == "":
+        warnings.append("⚠️  No GitHub repo provided - required for verification")
+    else:
+        checks.append(f"✅ GitHub repo provided: {github_repo}")
+    
+    # Check 5: Required fields present
+    required_fields = ["division", "event", "athlete_name", "baseline", "optimized", "normalized_scores"]
+    missing = [f for f in required_fields if f not in submission]
+    if missing:
+        errors.append(f"❌ Missing required fields: {', '.join(missing)}")
+    else:
+        checks.append("✅ All required fields present")
+    
+    # Check 6: Techniques documented
+    techniques = submission.get("techniques_applied", [])
+    if not techniques or "TODO" in str(techniques):
+        warnings.append("⚠️  No optimization techniques listed")
+    else:
+        checks.append(f"✅ Techniques documented: {', '.join(techniques[:3])}...")
+    
+    return {
+        "valid": len(errors) == 0,
+        "checks": checks,
+        "warnings": warnings,
+        "errors": errors
+    }
+
+#| export
+def generate_submission(baseline_model, optimized_model, 
+                       division: str = "closed",
+                       event: str = "all_around",
+                       athlete_name: str = "YourName",
+                       github_repo: str = "",
+                       techniques: List[str] = None) -> Dict[str, Any]:
+    """
+    Generate standardized TinyMLPerf competition submission with normalized scoring.
+    
+    Args:
+        baseline_model: Original unoptimized model
+        optimized_model: Your optimized model
+        division: "closed" or "open"
+        event: Competition category (latency_sprint, memory_challenge, all_around, etc.)
+        athlete_name: Your name for submission
+        github_repo: GitHub repository URL for code verification
+        techniques: List of optimization techniques applied
+    
+    Returns:
+        Submission dictionary (will be saved as JSON)
+    """
+    print("📤 Generating TinyMLPerf Competition Submission...")
+    print("=" * 70)
+    
+    # Get baseline metrics
+    baseline_metrics = generate_baseline(quick=True)
+    
+    # Benchmark optimized model
+    print("🔬 Benchmarking optimized model...")
+    
+    # Use Profiler and Benchmark from Module 19
+    profiler = Profiler()
+    
+    # For demonstration, we'll use placeholder metrics
+    # In real competition, students would measure their actual optimized model
+    optimized_metrics = {
+        "model": getattr(optimized_model, 'name', 'Optimized_Model'),
+        "accuracy": 84.0,  # Would be measured with actual test set
+        "latency_ms": 28.0,  # Would be measured with profiler
+        "memory_mb": 4.0,  # Would be measured with profiler
+        "parameters": 2000000,  # Would be counted
+    }
+    
+    # Calculate normalized scores using Module 19's function
+    baseline_for_norm = {
+        "latency": baseline_metrics["latency_ms"],
+        "memory": baseline_metrics["memory_mb"],
+        "accuracy": baseline_metrics["accuracy"]
+    }
+    
+    optimized_for_norm = {
+        "latency": optimized_metrics["latency_ms"],
+        "memory": optimized_metrics["memory_mb"],
+        "accuracy": optimized_metrics["accuracy"]
+    }
+    
+    normalized_scores = calculate_normalized_scores(baseline_for_norm, optimized_for_norm)
+    
+    # Create submission with all required fields
+    submission = {
+        "division": division,
+        "event": event,
+        "athlete_name": athlete_name,
+        "github_repo": github_repo,
+        "baseline": baseline_metrics,
+        "optimized": optimized_metrics,
+        "normalized_scores": {
+            "speedup": normalized_scores["speedup"],
+            "compression_ratio": normalized_scores["compression_ratio"],
+            "accuracy_delta": normalized_scores["accuracy_delta"],
+            "efficiency_score": normalized_scores["efficiency_score"]
+        },
+        "techniques_applied": techniques or ["TODO: Document your optimization techniques"],
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+        "tinytorch_version": "0.1.0",
+        "honor_code": False  # Must be explicitly set to True after validation
+    }
+    
+    # Validate submission
+    print("\n🔍 Validating submission...")
+    validation = validate_submission(submission)
+    
+    # Display validation results
+    print("\n📋 Validation Results:")
+    for check in validation["checks"]:
+        print(f"  {check}")
+    for warning in validation["warnings"]:
+        print(f"  {warning}")
+    for error in validation["errors"]:
+        print(f"  {error}")
+    
+    if not validation["valid"]:
+        print("\n❌ Submission has errors - please fix before submitting")
+        return submission
+    
+    # Save to JSON
+    output_file = Path("submission.json")
+    with open(output_file, "w") as f:
+        json.dump(submission, f, indent=2)
+    
+    print(f"\n✅ Submission saved to: {output_file}")
+    print()
+    print("📊 Your Normalized Scores (MLPerf-style):")
+    print(f"  Division:        {division.upper()}")
+    print(f"  Event:           {event.replace('_', ' ').title()}")
+    print(f"  Speedup:         {normalized_scores['speedup']:.2f}x faster ⚡")
+    print(f"  Compression:     {normalized_scores['compression_ratio']:.2f}x smaller 💾")
+    print(f"  Accuracy:        {optimized_metrics['accuracy']:.1f}% (Δ {normalized_scores['accuracy_delta']:+.2f}pp)")
+    print(f"  Efficiency:      {normalized_scores['efficiency_score']:.2f}")
+    print()
+    print("📤 Next Steps:")
+    print("  1. Verify all metrics are correct")
+    print("  2. Push your code to GitHub (if not done)")
+    print("  3. Run: tito submit submission.json")
+    print("     (This will validate and prepare final submission)")
+    print()
+    print("=" * 70)
+    
+    return submission
diff --git a/tinytorch/data/__init__.py b/tinytorch/data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tinytorch/data/loader.py b/tinytorch/data/loader.py
new file mode 100644
index 00000000..1018e73f
--- /dev/null
+++ b/tinytorch/data/loader.py
@@ -0,0 +1,262 @@
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║                        🚨 CRITICAL WARNING 🚨                                ║
+# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
+# ║                                                                               ║
+# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
+# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
+# ║                                                                               ║
+# ║  ✅ TO EDIT: modules/source/XX_loader/loader_dev.py                 ║
+# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
+# ║                                                                               ║
+# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
+# ║     Editing it directly may break module functionality and training.         ║
+# ║                                                                               ║
+# ║  🎓 LEARNING TIP: Work in modules/source/ - that's where real development    ║
+# ║     happens! The tinytorch/ directory is just the compiled output.           ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
+# %% auto 0
+__all__ = ['Dataset', 'TensorDataset', 'DataLoader']
+
+# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 0
+#| default_exp data.loader
+#| export
+
+# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 2
+# Essential imports for data loading
+import numpy as np
+import random
+from typing import Iterator, Tuple, List, Optional, Union
+from abc import ABC, abstractmethod
+
+# Import real Tensor class from tinytorch package
+from ..core.tensor import Tensor
+
+# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 4
+class Dataset(ABC):
+    """
+    Abstract base class for all datasets.
+
+    Provides the fundamental interface that all datasets must implement:
+    - __len__(): Returns the total number of samples
+    - __getitem__(idx): Returns the sample at given index
+
+    TODO: Implement the abstract Dataset base class
+
+    APPROACH:
+    1. Use ABC (Abstract Base Class) to define interface
+    2. Mark methods as @abstractmethod to force implementation
+    3. Provide clear docstrings for subclasses
+
+    EXAMPLE:
+    >>> class MyDataset(Dataset):
+    ...     def __len__(self): return 100
+    ...     def __getitem__(self, idx): return idx
+    >>> dataset = MyDataset()
+    >>> print(len(dataset))  # 100
+    >>> print(dataset[42])   # 42
+
+    HINT: Abstract methods force subclasses to implement core functionality
+    """
+
+    ### BEGIN SOLUTION
+    @abstractmethod
+    def __len__(self) -> int:
+        """
+        Return the total number of samples in the dataset.
+
+        This method must be implemented by all subclasses to enable
+        len(dataset) calls and batch size calculations.
+        """
+        pass
+
+    @abstractmethod
+    def __getitem__(self, idx: int):
+        """
+        Return the sample at the given index.
+
+        Args:
+            idx: Index of the sample to retrieve (0 <= idx < len(dataset))
+
+        Returns:
+            The sample at index idx. Format depends on the dataset implementation.
+            Could be (data, label) tuple, single tensor, etc.
+        """
+        pass
+    ### END SOLUTION
+
+# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 7
+class TensorDataset(Dataset):
+    """
+    Dataset wrapping tensors for supervised learning.
+
+    Each sample is a tuple of tensors from the same index across all input tensors.
+    All tensors must have the same size in their first dimension.
+
+    TODO: Implement TensorDataset for tensor-based data
+
+    APPROACH:
+    1. Store all input tensors
+    2. Validate they have same first dimension (number of samples)
+    3. Return tuple of tensor slices for each index
+
+    EXAMPLE:
+    >>> features = Tensor([[1, 2], [3, 4], [5, 6]])  # 3 samples, 2 features each
+    >>> labels = Tensor([0, 1, 0])                    # 3 labels
+    >>> dataset = TensorDataset(features, labels)
+    >>> print(len(dataset))  # 3
+    >>> print(dataset[1])    # (Tensor([3, 4]), Tensor(1))
+
+    HINTS:
+    - Use *tensors to accept variable number of tensor arguments
+    - Check all tensors have same length in dimension 0
+    - Return tuple of tensor[idx] for all tensors
+    """
+
+    def __init__(self, *tensors):
+        """
+        Create dataset from multiple tensors.
+
+        Args:
+            *tensors: Variable number of Tensor objects
+
+        All tensors must have the same size in their first dimension.
+        """
+        ### BEGIN SOLUTION
+        assert len(tensors) > 0, "Must provide at least one tensor"
+
+        # Store all tensors
+        self.tensors = tensors
+
+        # Validate all tensors have same first dimension
+        first_size = len(tensors[0].data)  # Size of first dimension
+        for i, tensor in enumerate(tensors):
+            if len(tensor.data) != first_size:
+                raise ValueError(
+                    f"All tensors must have same size in first dimension. "
+                    f"Tensor 0: {first_size}, Tensor {i}: {len(tensor.data)}"
+                )
+        ### END SOLUTION
+
+    def __len__(self) -> int:
+        """Return number of samples (size of first dimension)."""
+        ### BEGIN SOLUTION
+        return len(self.tensors[0].data)
+        ### END SOLUTION
+
+    def __getitem__(self, idx: int) -> Tuple[Tensor, ...]:
+        """
+        Return tuple of tensor slices at given index.
+
+        Args:
+            idx: Sample index
+
+        Returns:
+            Tuple containing tensor[idx] for each input tensor
+        """
+        ### BEGIN SOLUTION
+        if idx >= len(self) or idx < 0:
+            raise IndexError(f"Index {idx} out of range for dataset of size {len(self)}")
+
+        # Return tuple of slices from all tensors
+        return tuple(Tensor(tensor.data[idx]) for tensor in self.tensors)
+        ### END SOLUTION
+
+# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 10
+class DataLoader:
+    """
+    Data loader with batching and shuffling support.
+
+    Wraps a dataset to provide batched iteration with optional shuffling.
+    Essential for efficient training with mini-batch gradient descent.
+
+    TODO: Implement DataLoader with batching and shuffling
+
+    APPROACH:
+    1. Store dataset, batch_size, and shuffle settings
+    2. Create iterator that groups samples into batches
+    3. Handle shuffling by randomizing indices
+    4. Collate individual samples into batch tensors
+
+    EXAMPLE:
+    >>> dataset = TensorDataset(Tensor([[1,2], [3,4], [5,6]]), Tensor([0,1,0]))
+    >>> loader = DataLoader(dataset, batch_size=2, shuffle=True)
+    >>> for batch in loader:
+    ...     features_batch, labels_batch = batch
+    ...     print(f"Features: {features_batch.shape}, Labels: {labels_batch.shape}")
+
+    HINTS:
+    - Use random.shuffle() for index shuffling
+    - Group consecutive samples into batches
+    - Stack individual tensors using np.stack()
+    """
+
+    def __init__(self, dataset: Dataset, batch_size: int, shuffle: bool = False):
+        """
+        Create DataLoader for batched iteration.
+
+        Args:
+            dataset: Dataset to load from
+            batch_size: Number of samples per batch
+            shuffle: Whether to shuffle data each epoch
+        """
+        ### BEGIN SOLUTION
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        ### END SOLUTION
+
+    def __len__(self) -> int:
+        """Return number of batches per epoch."""
+        ### BEGIN SOLUTION
+        # Calculate number of complete batches
+        return (len(self.dataset) + self.batch_size - 1) // self.batch_size
+        ### END SOLUTION
+
+    def __iter__(self) -> Iterator:
+        """Return iterator over batches."""
+        ### BEGIN SOLUTION
+        # Create list of indices
+        indices = list(range(len(self.dataset)))
+
+        # Shuffle if requested
+        if self.shuffle:
+            random.shuffle(indices)
+
+        # Yield batches
+        for i in range(0, len(indices), self.batch_size):
+            batch_indices = indices[i:i + self.batch_size]
+            batch = [self.dataset[idx] for idx in batch_indices]
+
+            # Collate batch - convert list of tuples to tuple of tensors
+            yield self._collate_batch(batch)
+        ### END SOLUTION
+
+    def _collate_batch(self, batch: List[Tuple[Tensor, ...]]) -> Tuple[Tensor, ...]:
+        """
+        Collate individual samples into batch tensors.
+
+        Args:
+            batch: List of sample tuples from dataset
+
+        Returns:
+            Tuple of batched tensors
+        """
+        ### BEGIN SOLUTION
+        if len(batch) == 0:
+            return ()
+
+        # Determine number of tensors per sample
+        num_tensors = len(batch[0])
+
+        # Group tensors by position
+        batched_tensors = []
+        for tensor_idx in range(num_tensors):
+            # Extract all tensors at this position
+            tensor_list = [sample[tensor_idx].data for sample in batch]
+
+            # Stack into batch tensor
+            batched_data = np.stack(tensor_list, axis=0)
+            batched_tensors.append(Tensor(batched_data))
+
+        return tuple(batched_tensors)
+        ### END SOLUTION
diff --git a/tinytorch/utils/data/__init__.py b/tinytorch/utils/data/__init__.py
new file mode 100644
index 00000000..61a7cb2a
--- /dev/null
+++ b/tinytorch/utils/data/__init__.py
@@ -0,0 +1,16 @@
+"""
+TinyTorch Data Loading Utilities
+
+Following torch.utils.data patterns, this module provides:
+- Dataset: Base class for all datasets
+- DataLoader: Batching and shuffling for training
+- Common datasets for learning
+
+This is Module 10 of TinyTorch.
+"""
+
+# Import from dataloader module
+from .dataloader import *
+
+# Make key classes easily accessible
+__all__ = ['Dataset', 'DataLoader', 'SimpleDataset', 'CIFAR10Dataset']
\ No newline at end of file