docs(paper): clarify Adam memory overhead + fix LaTeX labels

- Clarify "3× memory" → "3× optimizer-related memory (gradients plus
  two state buffers)" for accuracy
- Fix lstlisting label syntax: label=lst:... → label={lst:...}
- Remove stale figure reference, use prose instead
- Fix convolution comment: 6 → 7 nested loops (matches actual code)
- Remove unused benchmark_table3.py
This commit is contained in:
Vijay Janapa Reddi
2026-01-28 17:19:55 -05:00
parent 5d4476d81c
commit eb84df491e
3 changed files with 11 additions and 292 deletions

View File

@@ -1,281 +0,0 @@
#!/usr/bin/env python3
"""
Benchmark script to generate real performance numbers for Table 3 in the paper.
Compares TinyTorch implementations against PyTorch on CPU.
"""
import time
import numpy as np
import torch
import sys
from pathlib import Path
# Add TinyTorch to path
# NOTE(review): assumes this script lives one directory below the repo root —
# confirm if the file is ever moved.
repo_root = Path(__file__).parent.parent
sys.path.insert(0, str(repo_root))
# Import TinyTorch components
# If TinyTorch is not importable, the benchmarks below fall back to naive
# NumPy/pure-Python implementations instead of failing outright.
try:
    from tinytorch.core import Tensor as TTTensor
    from tinytorch.nn import Conv2d as TTConv2d
    TINYTORCH_AVAILABLE = True
except ImportError:
    print("Warning: TinyTorch not available. Will create mock implementations.")
    TINYTORCH_AVAILABLE = False
def benchmark_function(func, *args, warmup=3, runs=10):
    """Time ``func(*args)``, returning (mean, std) of wall-clock seconds.

    The first ``warmup`` calls are discarded (cache/JIT warm-up); the next
    ``runs`` calls are each timed with ``time.perf_counter``.
    """
    # Untimed warm-up calls.
    for _ in range(warmup):
        func(*args)
    # Timed runs.
    samples = []
    for _ in range(runs):
        t0 = time.perf_counter()
        func(*args)
        samples.append(time.perf_counter() - t0)
    return np.mean(samples), np.std(samples)
def benchmark_matmul():
    """Compare a 1000x1000 matrix product in PyTorch vs TinyTorch.

    Returns (pytorch_ms, tinytorch_ms, slowdown_ratio). Falls back to a
    triple-loop NumPy implementation when TinyTorch is unavailable.
    """
    print("\n=== Benchmarking Matrix Multiplication (1K×1K) ===")
    # Reference: PyTorch's optimized mm kernel on the same operands.
    lhs = torch.randn(1000, 1000)
    rhs = torch.randn(1000, 1000)
    ref_mean, ref_std = benchmark_function(lambda: torch.mm(lhs, rhs))
    print(f"PyTorch: {ref_mean*1000:.2f} ms ± {ref_std*1000:.2f} ms")
    if TINYTORCH_AVAILABLE:
        # Wrap the identical data in TinyTorch tensors for a fair comparison.
        lhs_tt = TTTensor(lhs.numpy())
        rhs_tt = TTTensor(rhs.numpy())
        slow_mean, slow_std = benchmark_function(
            lambda: lhs_tt @ rhs_tt, warmup=1, runs=5)
        print(f"TinyTorch: {slow_mean*1000:.2f} ms ± {slow_std*1000:.2f} ms")
    else:
        # Pure-Python fallback: the classic i/j/k triple loop.
        a_np = lhs.numpy()
        b_np = rhs.numpy()

        def triple_loop_matmul():
            rows, inner, cols = a_np.shape[0], a_np.shape[1], b_np.shape[1]
            out = np.zeros((rows, cols))
            for i in range(rows):
                for j in range(cols):
                    for k in range(inner):
                        out[i, j] += a_np[i, k] * b_np[k, j]
            return out

        slow_mean, slow_std = benchmark_function(triple_loop_matmul, warmup=1, runs=3)
        print(f"TinyTorch (naive): {slow_mean*1000:.2f} ms ± {slow_std*1000:.2f} ms")
    ratio = slow_mean / ref_mean
    print(f"Ratio: {ratio:.0f}×")
    return ref_mean * 1000, slow_mean * 1000, ratio
def benchmark_conv2d():
    """Compare Conv2d on a CIFAR-shaped batch (128, 3, 32, 32), 32 filters 5×5.

    Returns (pytorch_ms, tinytorch_seconds, slowdown_ratio). Note the mixed
    units: the TinyTorch time is reported in seconds because it is orders of
    magnitude slower.
    """
    print("\n=== Benchmarking Conv2d (CIFAR batch) ===")
    n_batch, c_in, c_out, ksize = 128, 3, 32, 5
    # Reference: PyTorch's Conv2d (bias disabled to match the naive loop).
    ref_input = torch.randn(n_batch, c_in, 32, 32)
    ref_conv = torch.nn.Conv2d(c_in, c_out, ksize, bias=False)
    ref_mean, ref_std = benchmark_function(lambda: ref_conv(ref_input))
    print(f"PyTorch: {ref_mean*1000:.2f} ms ± {ref_std*1000:.2f} ms")
    if TINYTORCH_AVAILABLE:
        try:
            # Run TinyTorch's Conv2d with PyTorch's weights copied over so
            # both implementations do identical arithmetic.
            tt_input = TTTensor(ref_input.numpy())
            tt_conv = TTConv2d(c_in, c_out, ksize, bias=False)
            tt_conv.weight.data = ref_conv.weight.detach().numpy()
            slow_mean, slow_std = benchmark_function(
                lambda: tt_conv(tt_input), warmup=1, runs=3)
            print(f"TinyTorch: {slow_mean:.2f} s ± {slow_std:.2f} s")
        except Exception as e:
            print(f"TinyTorch Conv2d failed: {e}")
            print("Falling back to naive implementation with smaller batch...")
            slow_mean = benchmark_conv2d_naive_small(ref_conv.weight.detach().numpy())
    else:
        # The naive loop on the full batch would take far too long.
        print("Using smaller batch (8 instead of 128) for naive implementation...")
        slow_mean = benchmark_conv2d_naive_small(ref_conv.weight.detach().numpy())
    ratio = slow_mean / ref_mean
    print(f"Ratio: {ratio:.0f}×")
    return ref_mean * 1000, slow_mean, ratio
def benchmark_conv2d_naive_small(weight_np):
    """Time a naive 7-loop conv2d on a reduced batch, extrapolated to batch=128.

    Args:
        weight_np: convolution weights of shape (C_out, C_in, K_h, K_w),
            e.g. from ``torch_conv.weight.detach().numpy()``.

    Returns:
        Estimated seconds for a full batch of 128 (measured batch-8 time
        scaled linearly, since the loop cost is linear in batch size).
    """
    batch_size_small = 8  # Reduced from 128; full batch is too slow to measure.
    in_channels = 3
    input_small = np.random.randn(batch_size_small, in_channels, 32, 32)
    def naive_conv2d():
        """7 nested loops as shown in the paper"""
        B, C_in, H, W = input_small.shape
        # Second dim of the weight must match C_in; unused beyond the unpack.
        C_out, _, K_h, K_w = weight_np.shape
        # "Valid" convolution: no padding, stride 1.
        H_out = H - K_h + 1
        W_out = W - K_w + 1
        output = np.zeros((B, C_out, H_out, W_out))
        for b in range(B):
            for c_out in range(C_out):
                for h in range(H_out):
                    for w in range(W_out):
                        for c_in in range(C_in):
                            for kh in range(K_h):
                                for kw in range(K_w):
                                    output[b, c_out, h, w] += \
                                        input_small[b, c_in, h+kh, w+kw] * \
                                        weight_np[c_out, c_in, kh, kw]
        return output
    # One untimed-warmup-free run: the loop is slow and deterministic in cost.
    tt_mean, tt_std = benchmark_function(naive_conv2d, warmup=0, runs=1)
    print(f"TinyTorch (batch=8): {tt_mean:.2f} s ± {tt_std:.2f} s")
    # Extrapolate to full batch size (linear scaling)
    extrapolated = tt_mean * (128 / 8)
    print(f"TinyTorch (extrapolated to batch=128): {extrapolated:.2f} s")
    return extrapolated
def benchmark_softmax():
    """Compare softmax over 10K elements: PyTorch vs a NumPy implementation.

    Returns (pytorch_ms, numpy_ms, slowdown_ratio).
    """
    print("\n=== Benchmarking Softmax (10K elements) ===")
    n_elem = 10000
    # Reference: PyTorch's fused softmax kernel.
    ref_input = torch.randn(n_elem)
    ref_mean, ref_std = benchmark_function(
        lambda: torch.nn.functional.softmax(ref_input, dim=0))
    print(f"PyTorch: {ref_mean*1000:.3f} ms ± {ref_std*1000:.3f} ms")
    # "TinyTorch" stand-in: straightforward NumPy softmax on the same data.
    x_np = ref_input.numpy()

    def numpy_softmax():
        # Max-subtraction keeps exp() from overflowing (numerical stability).
        shifted = x_np - np.max(x_np)
        exps = np.exp(shifted)
        return exps / np.sum(exps)

    slow_mean, slow_std = benchmark_function(numpy_softmax, warmup=2, runs=10)
    print(f"TinyTorch: {slow_mean*1000:.3f} ms ± {slow_std*1000:.3f} ms")
    ratio = slow_mean / ref_mean
    print(f"Ratio: {ratio:.0f}×")
    return ref_mean * 1000, slow_mean * 1000, ratio
def format_time(ms):
    """Render a duration given in milliseconds with unit-appropriate precision.

    Sub-millisecond values get two decimals, values under a second get one,
    and anything longer is shown in seconds.
    """
    if ms >= 1000:
        return f"{ms/1000:.1f} s"
    if ms >= 1:
        return f"{ms:.1f} ms"
    return f"{ms:.2f} ms"
def main():
    """Run all benchmarks and print results plus a ready-to-paste LaTeX table."""
    print("=" * 60)
    print("TinyTorch vs PyTorch Performance Benchmark")
    print("=" * 60)
    # Record library versions so reported numbers are reproducible.
    print(f"NumPy version: {np.__version__}")
    print(f"PyTorch version: {torch.__version__}")
    print(f"TinyTorch available: {TINYTORCH_AVAILABLE}")
    print("=" * 60)
    results = {}
    # Run benchmarks; each returns (pytorch_time, tinytorch_time, ratio).
    # Units differ per benchmark: matmul/softmax report ms for both, conv2d
    # reports PyTorch in ms but TinyTorch in seconds (see benchmark_conv2d).
    results['matmul'] = benchmark_matmul()
    results['conv2d'] = benchmark_conv2d()
    results['softmax'] = benchmark_softmax()
    # Print LaTeX table (raw strings so backslashes survive unescaped).
    print("\n" + "=" * 60)
    print("LaTeX Table for paper:")
    print("=" * 60)
    print(r"\begin{table}[t]")
    print(r"\centering")
    print(r"\caption{Runtime comparison: TinyTorch vs PyTorch (CPU).}")
    print(r"\label{tab:performance}")
    print(r"\small")
    print(r"\begin{tabular}{@{}lrrr@{}}")
    print(r"\toprule")
    print(r"Operation & TinyTorch & PyTorch & Ratio \\")
    print(r"\midrule")
    # Format matmul (both times in ms).
    pt_mm, tt_mm, ratio_mm = results['matmul']
    print(f"\\texttt{{matmul}} (1K$\\times$1K) & {tt_mm:.0f} ms & {pt_mm:.1f} ms & {ratio_mm:.0f}$\\times$ \\\\")
    # Format conv2d (TinyTorch in seconds, PyTorch in ms — intentional).
    pt_conv, tt_conv, ratio_conv = results['conv2d']
    print(f"\\texttt{{conv2d}} (CIFAR batch) & {tt_conv:.1f} s & {pt_conv:.0f} ms & {ratio_conv:.0f}$\\times$ \\\\")
    # Format softmax (both times in ms).
    pt_soft, tt_soft, ratio_soft = results['softmax']
    print(f"\\texttt{{softmax}} (10K elem) & {tt_soft:.0f} ms & {pt_soft:.2f} ms & {ratio_soft:.0f}$\\times$ \\\\")
    print(r"\midrule")
    # Placeholder row: full-epoch numbers to be filled in manually.
    print(r"CIFAR-10 epoch (LeNet) & \textit{TBD} & \textit{TBD} & \textit{TBD} \\")
    print(r"\bottomrule")
    print(r"\end{tabular}")
    print(r"\end{table}")
    # Human-readable summary of the slowdown ratios.
    print("\n" + "=" * 60)
    print("Summary:")
    print("=" * 60)
    print(f"MatMul (1K×1K): {ratio_mm:6.0f}× slower")
    print(f"Conv2d (CIFAR): {ratio_conv:6.0f}× slower")
    print(f"Softmax (10K): {ratio_soft:6.0f}× slower")
    print(f"Average slowdown: {np.mean([ratio_mm, ratio_conv, ratio_soft]):6.0f}×")
# Run the full benchmark suite only when executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -221,7 +221,7 @@
% Abstract - REVISED: Curriculum design focus
\begin{abstract}
Machine learning education faces a fundamental gap: students learn algorithms without understanding the systems that execute them. They study gradient descent without measuring memory, attention mechanisms without analyzing $O(N^2)$ scaling, optimizer theory without knowing why Adam requires $3\times$ the memory of SGD. This \emph{algorithm-systems divide} produces practitioners who can train models but cannot debug memory failures, optimize inference latency, or reason about deployment trade-offs—the very skills industry demands as ``ML systems engineering.'' We present TinyTorch, a 20-module curriculum that closes this gap through \emph{implementation-based systems pedagogy}: students construct PyTorch's core components (tensors, autograd, optimizers, CNNs, transformers) in pure Python, building a complete framework where every operation they invoke is code they wrote. The design employs three patterns: \emph{progressive disclosure} of complexity, \emph{systems-first integration} of profiling from the first module, and \emph{build-to-validate milestones} recreating 67 years of ML breakthroughs—from Perceptron (1958) through Transformers (2017) to MLPerf-style benchmarking. Requiring only 4GB RAM and no GPU, TinyTorch demonstrates that deep ML systems understanding is achievable without specialized hardware. The curriculum is available open-source at \texttt{mlsysbook.ai/tinytorch}.
Machine learning education faces a fundamental gap: students learn algorithms without understanding the systems that execute them. They study gradient descent without measuring memory, attention mechanisms without analyzing $O(N^2)$ scaling, optimizer theory without knowing why Adam requires $3\times$ the optimizer-related memory of SGD (gradients plus two state buffers). This \emph{algorithm-systems divide} produces practitioners who can train models but cannot debug memory failures, optimize inference latency, or reason about deployment trade-offs—the very skills industry demands as ``ML systems engineering.'' We present TinyTorch, a 20-module curriculum that closes this gap through \emph{implementation-based systems pedagogy}: students construct PyTorch's core components (tensors, autograd, optimizers, CNNs, transformers) in pure Python, building a complete framework where every operation they invoke is code they wrote. The design employs three patterns: \emph{progressive disclosure} of complexity, \emph{systems-first integration} of profiling from the first module, and \emph{build-to-validate milestones} recreating 67 years of ML breakthroughs—from Perceptron (1958) through Transformers (2017) to MLPerf-style benchmarking. Requiring only 4GB RAM and no GPU, TinyTorch demonstrates that deep ML systems understanding is achievable without specialized hardware. The curriculum is available open-source at \texttt{mlsysbook.ai/tinytorch}.
\end{abstract}
% Main content
@@ -342,7 +342,7 @@ for batch in DataLoader(data):
\end{figure*}
Building systems knowledge alongside ML fundamentals presents three pedagogical challenges: teaching systems thinking early without overwhelming beginners (\Cref{sec:systems}), managing cognitive load when teaching both algorithms and implementation (\Cref{sec:progressive}), and validating student understanding through concrete milestones (\Cref{subsec:milestones}). TinyTorch addresses these through curriculum design inspired by compiler courses~\citep{aho2006compilers}: students build a complete system incrementally, with each module adding functionality while maintaining a working implementation. \Cref{fig:module-flow} illustrates this progression: tensors (Module 01) enable activations (02) and layers (03), which feed into dataloader (05) and autograd (06), powering optimizers (07) and training (08). Each completed module becomes immediately usable: after Module 03, students build neural networks; after Module 06, automatic differentiation enables training; after Module 13, transformers support language modeling. This structure enables students to gradually construct mental models while seeing immediate results.
Building systems knowledge alongside ML fundamentals presents three pedagogical challenges: teaching systems thinking early without overwhelming beginners (\Cref{sec:systems}), managing cognitive load when teaching both algorithms and implementation (\Cref{sec:progressive}), and validating student understanding through concrete milestones (\Cref{subsec:milestones}). TinyTorch addresses these through curriculum design inspired by compiler courses~\citep{aho2006compilers}: students build a complete system incrementally, with each module adding functionality while maintaining a working implementation. This progression follows a deliberate dependency chain: tensors (Module 01) enable activations (02) and layers (03), which feed into dataloader (05) and autograd (06), powering optimizers (07) and training (08). Each completed module becomes immediately usable: after Module 03, students build neural networks; after Module 06, automatic differentiation enables training; after Module 13, transformers support language modeling. This structure enables students to gradually construct mental models while seeing immediate results.
TinyTorch is meant for students transitioning from framework \emph{users} to framework \emph{engineers}: those who have completed introductory ML courses (e.g., CS229, fast.ai) and want to understand PyTorch internals, those planning ML systems research or infrastructure careers, or practitioners debugging production deployment issues. The curriculum assumes NumPy proficiency and basic neural network familiarity but teaches framework architecture from first principles. Students needing immediate GPU/distributed training skills are better served by PyTorch tutorials; those preferring project-based application building will find high-level frameworks more appropriate. The 20-module structure supports flexible pacing: intensive completion, semester integration (parallel with lectures), or independent professional development.
@@ -527,7 +527,7 @@ TinyTorch organizes modules into three progressive tiers plus a capstone competi
\end{tabularx}
\end{table*}
\begin{lstlisting}[caption={\textbf{Memory Profiling.} Tensor implementation from Module 01 with explicit memory tracking.},label=lst:tensor-memory,float=t]
\begin{lstlisting}[caption={\textbf{Memory Profiling.} Tensor implementation from Module 01 with explicit memory tracking.},label={lst:tensor-memory},float=t]
class Tensor:
def __init__(self, data):
self.data = np.array(data, dtype=np.float32)
@@ -546,7 +546,7 @@ class Tensor:
\end{lstlisting}
\textbf{Tier 1: Foundation (Modules 01--08).}
Students build the mathematical core enabling neural networks to learn, following a deliberate \emph{Forward Pass $\rightarrow$ Learning Infrastructure $\rightarrow$ Training} progression. Modules 01--04 construct forward pass components in the order data flows: tensors (data structure), activations (non-linearity), layers (parameterized transformations), and losses (objective functions). Systems thinking begins immediately: Module 01 introduces \texttt{memory\_footprint()} before matrix multiplication (\Cref{lst:tensor-memory}), making memory a first-class concept. Modules 05--07 build learning infrastructure: data loading (Module 05) provides efficient batching, then automatic differentiation (Module 06) enables gradient computation through progressive disclosure (\Cref{sec:progressive}), and optimizers (Module 07) use those gradients for parameter updates. Students discover Adam's 3$\times$ memory overhead through direct measurement (\Cref{sec:systems}). The training loop (Module 08) integrates all components. This order is the minimal dependency chain: you cannot build optimizers without autograd (no gradients), cannot build autograd without losses (nothing to differentiate), cannot build losses without layers (no predictions). By tier completion, students recreate three historical milestones: \citet{rosenblatt1958perceptron}'s Perceptron, Minsky and Papert's XOR solution, and \citet{rumelhart1986learning}'s backpropagation targeting 95\%+ on MNIST.
Students build the mathematical core enabling neural networks to learn, following a deliberate \emph{Forward Pass $\rightarrow$ Learning Infrastructure $\rightarrow$ Training} progression. Modules 01--04 construct forward pass components in the order data flows: tensors (data structure), activations (non-linearity), layers (parameterized transformations), and losses (objective functions). Systems thinking begins immediately: Module 01 introduces \texttt{memory\_footprint()} before matrix multiplication (\Cref{lst:tensor-memory}), making memory a first-class concept. Modules 05--07 build learning infrastructure: data loading (Module 05) provides efficient batching, then automatic differentiation (Module 06) enables gradient computation through progressive disclosure (\Cref{sec:progressive}), and optimizers (Module 07) use those gradients for parameter updates. Students discover Adam's 3$\times$ optimizer-related memory overhead (gradients plus two state buffers) through direct measurement (\Cref{sec:systems}). The training loop (Module 08) integrates all components. This order is the minimal dependency chain: you cannot build optimizers without autograd (no gradients), cannot build autograd without losses (nothing to differentiate), cannot build losses without layers (no predictions). By tier completion, students recreate three historical milestones: \citet{rosenblatt1958perceptron}'s Perceptron, Minsky and Papert's XOR solution, and \citet{rumelhart1986learning}'s backpropagation targeting 95\%+ on MNIST.
\textbf{Tier 2: Architectures (Modules 09--13).}
Students apply foundation knowledge to modern architectures, with the tier branching into parallel \emph{Vision} and \emph{Language} tracks. This bifurcation reflects domain-specific requirements: vision processes spatial grids (images), while language processes variable-length sequences (text). Both tracks build on the DataLoader patterns from Module 05 and training infrastructure from Module 08. TinyTorch ships with two custom educational datasets: \textbf{TinyDigits} ($\sim$1,000 grayscale handwritten digits) and \textbf{TinyTalks} ($\sim$350 conversational Q\&A pairs). These datasets are deliberately small and offline-first: they require no network connectivity during training, consume minimal storage ($<$50MB combined), and train in minutes on CPU-only hardware. This design ensures accessibility for students in regions with limited internet infrastructure, institutional computer labs with restricted network access, and developing countries where cloud-based datasets create barriers to ML education.
@@ -633,7 +633,7 @@ This section details how TinyTorch implements progressive disclosure: a pattern
TinyTorch's Module 01 \texttt{Tensor} class focuses exclusively on core tensor operations: data storage, arithmetic, matrix multiplication, and shape manipulation (\Cref{lst:foundation-tensor}). No gradient-related attributes exist yet, so students learn tensor fundamentals without cognitive overhead from features they won't use for five more modules. In Module 06, the \texttt{enable\_autograd()} function dynamically enhances \texttt{Tensor} with gradient tracking capabilities through monkey-patching (\Cref{lst:activation}). \Cref{fig:progressive-timeline} visualizes this enhancement timeline across the curriculum.
\begin{lstlisting}[caption={\textbf{Foundation Tensor.} Module 01 Tensor focuses on core operations. No gradient infrastructure exists yet; students learn tensor fundamentals first.},label=lst:foundation-tensor,float=t]
\begin{lstlisting}[caption={\textbf{Foundation Tensor.} Module 01 Tensor focuses on core operations. No gradient infrastructure exists yet; students learn tensor fundamentals first.},label={lst:foundation-tensor},float=t]
# Module 01: Foundation Tensor
class Tensor:
def __init__(self, data):
@@ -651,7 +651,7 @@ class Tensor:
return Tensor(self.data * other.data)
\end{lstlisting}
\begin{lstlisting}[caption={\textbf{Autograd Enhancement.} Module 06 monkey-patches Tensor to add gradient tracking. The original \texttt{\_\_init\_\_} is wrapped to accept \texttt{requires\_grad}, and operations are enhanced to build computation graphs.},label=lst:activation,float=t]
\begin{lstlisting}[caption={\textbf{Autograd Enhancement.} Module 06 monkey-patches Tensor to add gradient tracking. The original \texttt{\_\_init\_\_} is wrapped to accept \texttt{requires\_grad}, and operations are enhanced to build computation graphs.},label={lst:activation},float=t]
def enable_autograd():
"""Enhance Tensor with gradient tracking"""
_original_init = Tensor.__init__
@@ -794,7 +794,7 @@ Students learn to distinguish parameter memory (model weights) from optimizer st
Module 09 introduces convolution with seven explicit nested loops (\Cref{lst:conv-explicit}), making $O(B \times C_{\text{out}} \times H_{\text{out}} \times W_{\text{out}} \times C_{\text{in}} \times K_h \times K_w)$ complexity visible and countable.
\begin{lstlisting}[caption={\textbf{Explicit Convolution.} Seven nested loops reveal $O(C_{out} \times H \times W \times C_{in} \times K^2)$ complexity.},label=lst:conv-explicit,float=t]
\begin{lstlisting}[caption={\textbf{Explicit Convolution.} Seven nested loops reveal $O(C_{out} \times H \times W \times C_{in} \times K^2)$ complexity.},label={lst:conv-explicit},float=t]
def conv2d_explicit(input, weight):
"""7 nested loops - see the complexity!
input: (B, C_in, H, W)
@@ -930,7 +930,7 @@ TinyTorch supports three deployment environments: \textbf{JupyterHub} (instituti
\textbf{NBGrader Module Structure Example}: Each module uses NBGrader markdown cells to define assessment points and structure. For example, Module 01's memory profiling exercise:
\begin{lstlisting}[caption={\textbf{NBGrader Structure.} Cell metadata defines point allocation and solution delimiters.},label=lst:nbgrader-example,float=t]
\begin{lstlisting}[caption={\textbf{NBGrader Structure.} Cell metadata defines point allocation and solution delimiters.},label={lst:nbgrader-example},float=t]
# Cell metadata defines grading parameters:
# nbgrader = {
# "grade": true,
@@ -966,7 +966,7 @@ Unlike tutorial-style notebooks creating isolated code, TinyTorch modules export
As students complete modules, their framework accumulates capabilities. After Module 03, students can import and use layers; after Module 06, autograd enables training; after Module 09, CNNs become available. This progressive accumulation creates tangible evidence of progress: students see their framework grow from basic tensors to a complete ML system. \Cref{lst:progressive-imports} illustrates how imports expand as modules are completed:
\begin{lstlisting}[caption={\textbf{Progressive Imports.} Framework capabilities grow module-by-module as students complete implementations.},label=lst:progressive-imports,float=t]
\begin{lstlisting}[caption={\textbf{Progressive Imports.} Framework capabilities grow module-by-module as students complete implementations.},label={lst:progressive-imports},float=t]
# After Module 01: Basic tensors
from tinytorch import Tensor

View File

@@ -512,7 +512,7 @@ class Conv2d:
1. Extract input dimensions and validate
2. Calculate output dimensions
3. Apply padding if needed
4. Implement 6 nested loops for full convolution
4. Implement 7 nested loops for full convolution
5. Add bias if present
LOOP STRUCTURE:
@@ -580,7 +580,7 @@ class Conv2d:
# Initialize output
output = np.zeros((batch_size, out_channels, out_height, out_width))
# Explicit 6-nested loop convolution to show complexity
# Explicit 7-nested loop convolution to show complexity
for b in range(batch_size):
for out_ch in range(out_channels):
for out_h in range(out_height):