Mirror of https://github.com/harvard-edge/cs249r_book.git
The TinyTorch educational deep learning framework now lives at tinytorch/.
Structure:
- tinytorch/src/ - Source modules (single source of truth)
- tinytorch/tito/ - CLI tool
- tinytorch/tests/ - Test suite
- tinytorch/site/ - Jupyter Book website
- tinytorch/milestones/ - Historical ML implementations
- tinytorch/datasets/ - Educational datasets (tinydigits, tinytalks)
- tinytorch/assignments/ - NBGrader assignments
- tinytorch/instructor/ - Teaching materials
Workflows (with tinytorch- prefix):
- tinytorch-ci.yml - CI/CD pipeline
- tinytorch-publish-dev.yml - Dev site deployment
- tinytorch-publish-live.yml - Live site deployment
- tinytorch-build-pdf.yml - PDF generation
- tinytorch-release-check.yml - Release validation
Repository Variables added:
- TINYTORCH_ROOT = tinytorch
- TINYTORCH_SRC = tinytorch/src
- TINYTORCH_SITE = tinytorch/site
- TINYTORCH_TESTS = tinytorch/tests
All workflows use ${{ vars.TINYTORCH_* }} for path configuration.
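For example, a step in any of these workflows can resolve paths through the
variables instead of hard-coding them (illustrative step only, not copied from
the actual workflow files):

    - name: Run TinyTorch tests
      run: pytest ${{ vars.TINYTORCH_TESTS }}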
Note: tinytorch/site/_static/favicon.svg kept as SVG (valid for favicons)
#!/usr/bin/env python3
"""
Quick benchmark for Table 3 - uses reasonable approximations for slow operations
"""

import time

import numpy as np
import torch

def time_op(func, warmup=2, runs=5):
    """Time an operation: return the mean wall-clock seconds over `runs` calls."""
    for _ in range(warmup):  # warmup calls amortize one-time setup costs
        func()
    times = []
    for _ in range(runs):
        start = time.perf_counter()
        func()
        times.append(time.perf_counter() - start)
    return np.mean(times)

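# Note: the mean over runs is reported here; on noisy machines the minimum
# over runs (as the timeit docs suggest) is a common lower-variance alternative.
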
# 1. MatMul benchmark
print("=== MatMul (1K×1K) ===")
a_pt = torch.randn(1000, 1000)
b_pt = torch.randn(1000, 1000)
pt_mm_time = time_op(lambda: torch.mm(a_pt, b_pt))
print(f"PyTorch: {pt_mm_time*1000:.1f} ms")

# Naive triple-loop matmul
a_np = a_pt.numpy()
b_np = b_pt.numpy()

def naive_mm_single():
    result = np.zeros((1000, 1000))
    for i in range(1000):
        for j in range(1000):
            result[i, j] = np.dot(a_np[i, :], b_np[:, j])  # inner loop uses numpy dot
    return result

tt_mm_time = time_op(naive_mm_single, warmup=1, runs=3)
print(f"TinyTorch: {tt_mm_time*1000:.0f} ms")
print(f"Ratio: {tt_mm_time/pt_mm_time:.0f}×\n")

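# Why the gap: naive_mm_single dispatches 1,000,000 Python-level np.dot calls
# (one per output element); per-call interpreter overhead on the order of
# microseconds adds up to seconds, while torch.mm is a single optimized BLAS call.
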
# 2. Conv2d benchmark - use tiny batch to estimate
print("=== Conv2d (CIFAR batch - estimated from small run) ===")
batch_full = 128
batch_tiny = 1  # just 1 image for timing

input_pt = torch.randn(batch_tiny, 3, 32, 32)
conv_pt = torch.nn.Conv2d(3, 32, 5, bias=False)
pt_conv_time_tiny = time_op(lambda: conv_pt(input_pt))
pt_conv_time_full = pt_conv_time_tiny * batch_full  # assume linear scaling in batch size
print(f"PyTorch (batch={batch_full}): {pt_conv_time_full*1000:.0f} ms")

# Naive conv2d with 7 nested loops
input_np = input_pt.numpy()
weight_np = conv_pt.weight.detach().numpy()

def naive_conv2d():
    B, C_in, H, W = input_np.shape
    C_out, _, K_h, K_w = weight_np.shape
    H_out, W_out = H - K_h + 1, W - K_w + 1
    output = np.zeros((B, C_out, H_out, W_out))

    for b in range(B):
        for c_out in range(C_out):
            for h in range(H_out):
                for w in range(W_out):
                    for c_in in range(C_in):
                        for kh in range(K_h):
                            for kw in range(K_w):
                                output[b, c_out, h, w] += \
                                    input_np[b, c_in, h+kh, w+kw] * \
                                    weight_np[c_out, c_in, kh, kw]
    return output

tt_conv_time_tiny = time_op(naive_conv2d, warmup=0, runs=1)
tt_conv_time_full = tt_conv_time_tiny * batch_full
print(f"TinyTorch (batch={batch_full}): {tt_conv_time_full:.1f} s")
print(f"Ratio: {tt_conv_time_full/pt_conv_time_full:.0f}×\n")

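# Scale check: each image needs C_out*H_out*W_out*C_in*K_h*K_w =
# 32*28*28*3*5*5 ≈ 1.9M multiply-accumulates, every one of them executed as
# interpreted Python in the innermost loop body.
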
# 3. Softmax benchmark - pure Python loops
print("=== Softmax (10K elements) ===")
x_pt = torch.randn(10000)
pt_soft_time = time_op(lambda: torch.nn.functional.softmax(x_pt, dim=0), runs=20)
print(f"PyTorch: {pt_soft_time*1000:.3f} ms")

x_np = x_pt.numpy()

def pure_python_softmax():
    """Pure Python softmax without numpy vectorization"""
    n = len(x_np)
    # Find max (subtracted below for numerical stability)
    max_val = x_np[0]
    for i in range(1, n):
        if x_np[i] > max_val:
            max_val = x_np[i]

    # Compute exp and sum
    exp_vals = []
    sum_exp = 0.0
    for i in range(n):
        exp_val = np.exp(x_np[i] - max_val)
        exp_vals.append(exp_val)
        sum_exp += exp_val

    # Normalize
    result = [e / sum_exp for e in exp_vals]
    return result

tt_soft_time = time_op(pure_python_softmax, warmup=1, runs=5)
print(f"TinyTorch: {tt_soft_time*1000:.0f} ms")
print(f"Ratio: {tt_soft_time/pt_soft_time:.0f}×\n")

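# The softmax gap is almost entirely interpreter overhead: 10,000 iterations
# of a Python loop, each making a scalar np.exp call, versus one vectorized
# kernel in PyTorch.
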
# Generate LaTeX table
print("="*60)
print("LaTeX Table:")
print("="*60)
print(r"\begin{table}[t]")
print(r"\centering")
print(r"\caption{Runtime comparison: TinyTorch vs PyTorch (CPU).}")
print(r"\label{tab:performance}")
print(r"\small")
print(r"\begin{tabular}{@{}lrrr@{}}")
print(r"\toprule")
print(r"Operation & TinyTorch & PyTorch & Ratio \\")
print(r"\midrule")
print(f"\\texttt{{matmul}} (1K$\\times$1K) & {tt_mm_time*1000:.0f} ms & {pt_mm_time*1000:.1f} ms & {tt_mm_time/pt_mm_time:.0f}$\\times$ \\\\")
print(f"\\texttt{{conv2d}} (CIFAR batch) & {tt_conv_time_full:.1f} s & {pt_conv_time_full*1000:.0f} ms & {tt_conv_time_full/pt_conv_time_full:.0f}$\\times$ \\\\")
print(f"\\texttt{{softmax}} (10K elem) & {tt_soft_time*1000:.0f} ms & {pt_soft_time*1000:.2f} ms & {tt_soft_time/pt_soft_time:.0f}$\\times$ \\\\")
print(r"\bottomrule")
print(r"\end{tabular}")
print(r"\end{table}")

print("\n" + "="*60)
print(f"Summary: {tt_mm_time/pt_mm_time:.0f}× (matmul), {tt_conv_time_full/pt_conv_time_full:.0f}× (conv2d), {tt_soft_time/pt_soft_time:.0f}× (softmax)")
print(f"Average slowdown: {np.mean([tt_mm_time/pt_mm_time, tt_conv_time_full/pt_conv_time_full, tt_soft_time/pt_soft_time]):.0f}×")