# TinyTorch/modules/16_acceleration/module.yaml

# Module identity and summary text.
name: 'acceleration'
title: 'Hardware Acceleration - The Simplest Optimization'
# Folded scalar (>-): the lines below are joined with single spaces into one
# line with no trailing newline.
description: >-
  Master the easiest optimization: using better backends! Learn why naive
  loops are slow, how cache-friendly blocking helps, and why NumPy provides
  100x+ speedups.

# What the learner should be able to do after completing the module.
learning_objectives:
  - 'Understand CPU cache hierarchy and memory access performance bottlenecks'
  - 'Implement cache-friendly blocked matrix multiplication algorithms'
  - 'Build vectorized operations with optimized memory access patterns'
  - 'Design transparent backend systems for automatic optimization selection'
  - 'Measure and quantify real performance improvements scientifically'
  - 'Apply systems thinking to optimization decisions in ML workflows'

# Earlier modules and background knowledge this module builds on.
prerequisites:
  - 'Module 2: Tensor operations and NumPy fundamentals'
  - 'Module 4: Linear layers and matrix multiplication'
  - 'Understanding of basic algorithmic complexity (O notation)'

# Kept as quoted strings so they are never re-typed by the parser.
estimated_time: '3-4 hours'
difficulty: 'Advanced'

# Free-form keywords describing the module's topics.
tags:
  - 'performance'
  - 'optimization'
  - 'systems'
  - 'hardware'
  - 'acceleration'
  - 'cache'
  - 'vectorization'
  - 'backends'

# Names listed for export; presumably the module's public symbols --
# NOTE(review): confirm against the module source.
exports:
  - 'matmul_naive'
  - 'matmul_blocked'
  - 'matmul_numpy'
  - 'OptimizedBackend'
  - 'matmul'
  - 'set_backend'

# Criteria used to assess completion of the module.
assessment:
  - 'Understand why naive loops have poor cache performance'
  - 'Implement cache-friendly blocked matrix multiplication showing 10-50x speedups'
  - 'Recognize why NumPy provides 100x+ speedups over custom implementations'
  - 'Build backend system that automatically chooses optimal implementations'
  # Double-quoted because the text itself contains apostrophes.
  - "Apply the 'free speedup' principle: use better tools, don't write faster code"