Files
cs249r_book/tinytorch/paper/benchmark_quick.py
Vijay Janapa Reddi c602f97364 feat: integrate TinyTorch into MLSysBook repository
TinyTorch educational deep learning framework now lives at tinytorch/

Structure:
- tinytorch/src/         - Source modules (single source of truth)
- tinytorch/tito/        - CLI tool
- tinytorch/tests/       - Test suite
- tinytorch/site/        - Jupyter Book website
- tinytorch/milestones/  - Historical ML implementations
- tinytorch/datasets/    - Educational datasets (tinydigits, tinytalks)
- tinytorch/assignments/ - NBGrader assignments
- tinytorch/instructor/  - Teaching materials

Workflows (with tinytorch- prefix):
- tinytorch-ci.yml           - CI/CD pipeline
- tinytorch-publish-dev.yml  - Dev site deployment
- tinytorch-publish-live.yml - Live site deployment
- tinytorch-build-pdf.yml    - PDF generation
- tinytorch-release-check.yml - Release validation

Repository Variables added:
- TINYTORCH_ROOT  = tinytorch
- TINYTORCH_SRC   = tinytorch/src
- TINYTORCH_SITE  = tinytorch/site
- TINYTORCH_TESTS = tinytorch/tests

All workflows use ${{ vars.TINYTORCH_* }} for path configuration.

Note: tinytorch/site/_static/favicon.svg kept as SVG (valid for favicons)
2025-12-05 19:23:18 -08:00

135 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Quick benchmark for Table 3 - uses reasonable approximations for slow operations
"""
import time
import numpy as np
import torch
def time_op(func, warmup=2, runs=5):
    """Return the mean wall-clock duration of ``func()`` in seconds.

    ``func`` is first called ``warmup`` times with nothing recorded, then
    called ``runs`` more times, each call bracketed by
    ``time.perf_counter()``.

    Args:
        func: Zero-argument callable to measure.
        warmup: Number of untimed calls made before measuring.
        runs: Number of timed calls; must be at least 1.

    Returns:
        The arithmetic mean of the per-call durations, as computed by
        ``np.mean``.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        # np.mean([]) would return nan (with a RuntimeWarning), and that nan
        # would flow silently into every ratio derived from this timing.
        raise ValueError(f"runs must be >= 1, got {runs}")
    for _ in range(warmup):
        func()
    times = []
    for _ in range(runs):
        start = time.perf_counter()
        func()
        times.append(time.perf_counter() - start)
    return np.mean(times)
# 1. MatMul benchmark: multiply two random 1000x1000 matrices.
print("=== MatMul (1K×1K) ===")
a_pt = torch.randn(1000, 1000)
b_pt = torch.randn(1000, 1000)
# PyTorch reference; time_op returns seconds, printed here as milliseconds.
pt_mm_time = time_op(lambda: torch.mm(a_pt, b_pt))
print(f"PyTorch: {pt_mm_time*1000:.1f} ms")
# Naive matmul baseline: two Python loops over the output entries with
# np.dot doing the innermost reduction (so not a full triple loop).
# It runs on NumPy versions of the same two tensors.
a_np = a_pt.numpy()
b_np = b_pt.numpy()
def naive_mm_single():
    """Multiply the module-level ``a_np`` and ``b_np`` one output entry at a time.

    Two Python loops walk the output matrix; each entry is the ``np.dot`` of
    one row of ``a_np`` with one column of ``b_np``, so only the innermost
    reduction runs inside NumPy.

    Returns:
        An array of shape ``(a_np.shape[0], b_np.shape[1])`` allocated with
        ``np.zeros`` (default dtype) holding the matrix product.
    """
    # Take the output size from the operands instead of repeating the literal
    # 1000, so the kernel stays correct if the benchmark size is changed.
    n_rows, n_cols = a_np.shape[0], b_np.shape[1]
    result = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            result[i, j] = np.dot(a_np[i, :], b_np[:, j])  # Inner loop uses numpy dot
    return result
# Time the loop-based matmul with 1 warmup call and 3 timed runs.
# NOTE(review): the figure labelled "TinyTorch" is the timing of
# naive_mm_single; this file imports only time, numpy and torch — confirm the
# label is intended.
tt_mm_time = time_op(naive_mm_single, warmup=1, runs=3)
print(f"TinyTorch: {tt_mm_time*1000:.0f} ms")
print(f"Ratio: {tt_mm_time/pt_mm_time:.0f}×\n")
# 2. Conv2d benchmark - use tiny batch to estimate
print("=== Conv2d (CIFAR batch - estimated from small run) ===")
batch_full = 128  # batch size the results are reported for
batch_tiny = 1 # Just 1 image for timing
input_pt = torch.randn(batch_tiny, 3, 32, 32)
# 3 input channels, 32 output channels, 5x5 kernel, no bias term.
conv_pt = torch.nn.Conv2d(3, 32, 5, bias=False)
pt_conv_time_tiny = time_op(lambda: conv_pt(input_pt))
# NOTE(review): the batch-128 PyTorch time is extrapolated as 128x the
# single-image time rather than measured; a real batched call may not scale
# linearly — confirm this approximation is acceptable for the reported ratio.
pt_conv_time_full = pt_conv_time_tiny * batch_full # Linear scaling
print(f"PyTorch (batch={batch_full}): {pt_conv_time_full*1000:.0f} ms")
# Naive conv2d with 7 nested loops
# It consumes NumPy versions of the same input and of the layer's weights.
input_np = input_pt.numpy()
weight_np = conv_pt.weight.detach().numpy()
def naive_conv2d():
    """Convolve the module-level ``input_np`` with ``weight_np`` using scalar loops.

    No padding and unit stride: every output position is the sum, over input
    channels and kernel offsets, of input value times kernel weight (the
    kernel is not flipped), accumulated one multiply-add at a time.

    Returns:
        An array of shape ``(batch, out_channels, H - K_h + 1, W - K_w + 1)``
        allocated with ``np.zeros`` (default dtype).
    """
    batch, in_channels, in_h, in_w = input_np.shape
    out_channels, _, k_h, k_w = weight_np.shape
    out_h = in_h - k_h + 1
    out_w = in_w - k_w + 1
    output = np.zeros((batch, out_channels, out_h, out_w))
    # Seven explicit loops are kept as-is: this loop nest is the workload
    # being timed.
    for n in range(batch):
        for oc in range(out_channels):
            for y in range(out_h):
                for x in range(out_w):
                    for ic in range(in_channels):
                        for dy in range(k_h):
                            for dx in range(k_w):
                                output[n, oc, y, x] += (
                                    input_np[n, ic, y + dy, x + dx]
                                    * weight_np[oc, ic, dy, dx]
                                )
    return output
# One timed run and no warmup for the scalar-loop convolution.
tt_conv_time_tiny = time_op(naive_conv2d, warmup=0, runs=1)
# Extrapolate the one-image time to the full batch by multiplying by batch_full.
tt_conv_time_full = tt_conv_time_tiny * batch_full
# This figure is printed in seconds; the PyTorch figure is printed in ms.
print(f"TinyTorch (batch={batch_full}): {tt_conv_time_full:.1f} s")
print(f"Ratio: {tt_conv_time_full/pt_conv_time_full:.0f}×\n")
# 3. Softmax benchmark - pure Python loops
print("=== Softmax (10K elements) ===")
x_pt = torch.randn(10000)
# PyTorch reference over the single dimension, averaged over 20 timed runs.
pt_soft_time = time_op(lambda: torch.nn.functional.softmax(x_pt, dim=0), runs=20)
print(f"PyTorch: {pt_soft_time*1000:.3f} ms")
# NumPy version of the same vector, consumed by the loop-based softmax.
x_np = x_pt.numpy()
def pure_python_softmax():
    """Compute the softmax of the module-level ``x_np`` one element at a time.

    Every stage (max search, exponentiation, summation, normalisation) is an
    explicit Python-level loop; NumPy is used only for the scalar ``np.exp``.

    Returns:
        A list with one normalised value per element of ``x_np``.
    """
    count = len(x_np)
    # Pass 1: running maximum; it is subtracted below so that no exponent
    # passed to np.exp is greater than zero.
    largest = x_np[0]
    for idx in range(1, count):
        if x_np[idx] > largest:
            largest = x_np[idx]
    # Pass 2: shifted exponentials and their running total.
    exps = []
    total = 0.0
    for idx in range(count):
        shifted_exp = np.exp(x_np[idx] - largest)
        exps.append(shifted_exp)
        total += shifted_exp
    # Pass 3: divide each exponential by the total.
    return [value / total for value in exps]
# Time the loop-based softmax with 1 warmup call and 5 timed runs.
tt_soft_time = time_op(pure_python_softmax, warmup=1, runs=5)
print(f"TinyTorch: {tt_soft_time*1000:.0f} ms")
# Slowdown factor of the loop-based version relative to the PyTorch timing.
print(f"Ratio: {tt_soft_time/pt_soft_time:.0f}×\n")
# Generate LaTeX table
# Banner, table scaffolding, the three measured rows and the closing rule are
# emitted by a single print call, one argument per output line.
print(
    "="*60,
    "LaTeX Table:",
    "="*60,
    r"\begin{table}[t]",
    r"\centering",
    r"\caption{Runtime comparison: TinyTorch vs PyTorch (CPU).}",
    r"\label{tab:performance}",
    r"\small",
    r"\begin{tabular}{@{}lrrr@{}}",
    r"\toprule",
    r"Operation & TinyTorch & PyTorch & Ratio \\",
    r"\midrule",
    f"\\texttt{{matmul}} (1K$\\times$1K) & {tt_mm_time*1000:.0f} ms & {pt_mm_time*1000:.1f} ms & {tt_mm_time/pt_mm_time:.0f}$\\times$ \\\\",
    f"\\texttt{{conv2d}} (CIFAR batch) & {tt_conv_time_full:.1f} s & {pt_conv_time_full*1000:.0f} ms & {tt_conv_time_full/pt_conv_time_full:.0f}$\\times$ \\\\",
    f"\\texttt{{softmax}} (10K elem) & {tt_soft_time*1000:.0f} ms & {pt_soft_time*1000:.2f} ms & {tt_soft_time/pt_soft_time:.0f}$\\times$ \\\\",
    r"\bottomrule",
    r"\end{tabular}",
    r"\end{table}",
    "\n" + "="*60,
    sep="\n",
)
# Plain-text recap: the three slowdown ratios, then their arithmetic mean.
print(
    f"Summary: {tt_mm_time/pt_mm_time:.0f}× (matmul), {tt_conv_time_full/pt_conv_time_full:.0f}× (conv2d), {tt_soft_time/pt_soft_time:.0f}× (softmax)",
    f"Average slowdown: {np.mean([tt_mm_time/pt_mm_time, tt_conv_time_full/pt_conv_time_full, tt_soft_time/pt_soft_time]):.0f}×",
    sep="\n",
)