Mirror of https://github.com/MLSysBook/TinyTorch.git (synced 2025-12-05 19:17:52 -06:00)
Add consistent Aha Moment demos to all 20 modules
Each module now includes a self-contained demo function that:
- Uses the 🎯 emoji for consistency with MODULE SUMMARY
- Explains what was built and why it matters
- Provides a quick, visual demonstration
- Runs automatically after test_module() in __main__
Format: demo_[module_name]() with markdown explanation before it.
All demos are self-contained with no cross-module imports.
@@ -256,13 +256,13 @@ Perfect if you want to **debug ML systems**, **implement custom operations**, or
Add yourself to the map • Share your progress • Connect with builders
</p>
<div style="display: flex; gap: 1rem; justify-content: center; flex-wrap: wrap;">
<a href="https://tinytorch.ai/join" target="_blank"
   style="display: inline-block; background: linear-gradient(135deg, #f97316 0%, #ea580c 100%);
          color: white; padding: 0.75rem 2rem; border-radius: 0.5rem;
          text-decoration: none; font-weight: 600; font-size: 1rem;
          box-shadow: 0 4px 6px rgba(0,0,0,0.2);">
  Join the Map →
</a>
<a href="https://tinytorch.ai/join" target="_blank"
   style="display: inline-block; background: linear-gradient(135deg, #f97316 0%, #ea580c 100%);
          color: white; padding: 0.75rem 2rem; border-radius: 0.5rem;
          text-decoration: none; font-weight: 600; font-size: 1rem;
          box-shadow: 0 4px 6px rgba(0,0,0,0.2);">
  Join the Map →
</a>
<a href="#" onclick="event.preventDefault(); if(window.openSubscribeModal) openSubscribeModal();"
   style="display: inline-block; background: rgba(255,255,255,0.1);
          border: 1px solid rgba(255,255,255,0.2);

@@ -1993,7 +1993,7 @@ def test_unit_complete_pipeline():
    # history = pipeline.train(dataloader, epochs=1)
    # assert 'losses' in history, "History should contain losses"
    # assert len(history['losses']) == 1, "Should have one epoch of losses"

    # Skip optimization test as it depends on training
    # pipeline.optimize_model(quantize=True, prune_sparsity=0.5)

@@ -340,10 +340,38 @@ class Tensor:
            f"Cannot perform matrix multiplication: {self.shape} @ {other.shape}. "
            f"Inner dimensions must match: {self.shape[-1]} ≠ {other.shape[-2]}"
        )
        result_data = np.matmul(self.data, other.data)

        # Educational implementation: explicit loops to show what matrix multiplication does
        # This is intentionally slower than np.matmul to demonstrate the value of vectorization
        # In Module 18 (Acceleration), students will learn to use optimized BLAS operations

        a = self.data
        b = other.data

        # Handle 2D matrices with explicit loops (educational)
        if len(a.shape) == 2 and len(b.shape) == 2:
            M, K = a.shape
            K2, N = b.shape
            result_data = np.zeros((M, N), dtype=a.dtype)

            # Explicit nested loops - students can see exactly what's happening!
            # Each output element is a dot product of a row from A and a column from B
            for i in range(M):
                for j in range(N):
                    # Dot product of row i from A with column j from B
                    result_data[i, j] = np.dot(a[i, :], b[:, j])
        else:
            # For batched operations (3D+), use np.matmul for correctness
            # Students will understand this once they grasp the 2D case
            result_data = np.matmul(a, b)

        return Tensor(result_data)
        ### END SOLUTION

    def __matmul__(self, other):
        """Enable @ operator for matrix multiplication."""
        return self.matmul(other)

    def __getitem__(self, key):
        """Enable indexing and slicing operations on Tensors."""
        ### BEGIN SOLUTION
@@ -1528,6 +1556,54 @@ def custom_activation(tensor):
**Key insight**: Algorithmic complexity (Big-O) doesn't tell the whole performance story. Constant factors from vectorization, cache behavior, and parallelism dominate in practice.
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Your Tensor Works Like NumPy

**What you built:** A complete Tensor class with arithmetic operations and matrix multiplication.

**Why it matters:** Your Tensor is the foundation of everything to come. Every neural network
operation—from simple addition to complex attention mechanisms—will use this class. The fact
that it works exactly like NumPy means you've built something production-ready.

In the next modules, you'll add activations, layers, and autograd on top of this foundation.
Every operation you just implemented will be called millions of times during training!
"""

# %%
def demo_tensor():
    """🎯 See your Tensor work just like NumPy."""
    print("🎯 AHA MOMENT: Your Tensor Works Like NumPy")
    print("=" * 45)

    # Create tensors
    a = Tensor(np.array([1, 2, 3]))
    b = Tensor(np.array([4, 5, 6]))

    # Tensor operations
    tensor_sum = a + b
    tensor_prod = a * b

    # NumPy equivalents
    np_sum = np.array([1, 2, 3]) + np.array([4, 5, 6])
    np_prod = np.array([1, 2, 3]) * np.array([4, 5, 6])

    print(f"Tensor a + b: {tensor_sum.data}")
    print(f"NumPy a + b: {np_sum}")
    print(f"Match: {np.allclose(tensor_sum.data, np_sum)}")

    print(f"\nTensor a * b: {tensor_prod.data}")
    print(f"NumPy a * b: {np_prod}")
    print(f"Match: {np.allclose(tensor_prod.data, np_prod)}")

    print("\n✨ Your Tensor is NumPy-compatible—ready for ML!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_tensor()

# %% [markdown]
"""
## 🎯 MODULE SUMMARY: Tensor Foundation

@@ -1071,6 +1071,47 @@ class Sigmoid:
```
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Activations Transform Data

**What you built:** Five activation functions that introduce nonlinearity to neural networks.

**Why it matters:** Without activations, stacking layers would just be matrix multiplication—
a linear operation. ReLU's simple "zero out negatives" rule is what allows networks to learn
complex patterns like recognizing faces or understanding language.

In the next module, you'll combine these activations with Linear layers to build complete
neural network architectures. The nonlinearity you just implemented is the secret sauce!
"""
"""

# %%
def demo_activations():
    """🎯 See how activations transform data."""
    print("🎯 AHA MOMENT: Activations Transform Data")
    print("=" * 45)

    # Test input with positive and negative values
    x = Tensor(np.array([-2.0, -1.0, 0.0, 1.0, 2.0]))
    print(f"Input: {x.data}")

    # ReLU - zeros out negatives
    relu = ReLU()
    relu_out = relu(x)
    print(f"ReLU: {relu_out.data} ← Negatives become 0!")

    # Sigmoid - squashes to (0, 1)
    sigmoid = Sigmoid()
    sigmoid_out = sigmoid(x)
    print(f"Sigmoid: {np.round(sigmoid_out.data, 2)} ← Squashed to (0,1)")

    print("\n✨ Activations add nonlinearity—the key to deep learning!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_activations()

# %% [markdown]
"""

@@ -1110,11 +1110,47 @@ if __name__ == "__main__":
    print("\n" + "=" * 70)
    print("✅ MODULE 03 COMPLETE!")
    print("=" * 70)
    print("\nNext steps:")
    print("1. Review the ML Systems Questions above")
    print("2. Export with: tito module complete 03_layers")
    print("3. Continue to Module 04: Loss Functions")

# %% [markdown]
"""
## 🎯 Aha Moment: Layers Transform Shapes

**What you built:** Linear layers that transform data from one dimension to another.

**Why it matters:** A Linear layer is the workhorse of neural networks. The transformation
from 784 features (a flattened 28×28 image) to 10 classes (digits 0-9) is exactly what
happens in digit recognition. You just built the core component!

In the next module, you'll add loss functions that measure how wrong predictions are.
Combined with your layers, this creates the foundation for learning.
"""

# %%
def demo_layers():
    """🎯 See how layers transform shapes."""
    print("🎯 AHA MOMENT: Layers Transform Shapes")
    print("=" * 45)

    # Create a layer that transforms 784 → 10 (like MNIST)
    layer = Linear(784, 10)

    # Simulate a batch of 32 flattened images
    batch = Tensor(np.random.randn(32, 784))

    # Forward pass
    output = layer(batch)

    print(f"Input shape: {batch.shape} ← 32 images, 784 pixels each")
    print(f"Output shape: {output.shape} ← 32 images, 10 classes each")
    print(f"Parameters: {784 * 10 + 10:,} (weights + biases)")

    print("\n✨ Your layer transforms images to class predictions!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_layers()

# %% [markdown]
"""

@@ -1620,6 +1620,53 @@ optimizer.step() # Update once with accumulated gradients
These questions test your systems understanding of loss functions - not just "how do they work" but "how do they behave in production at scale." Keep these considerations in mind as you build real ML systems!
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Loss Guides Learning

**What you built:** Loss functions that measure how wrong predictions are.

**Why it matters:** Without loss, there's no learning. The loss function is the "coach"
that tells the network whether its predictions are good or bad. Lower loss = better
predictions. Every training step aims to reduce this number.

In the next module, you'll add autograd which computes gradients of this loss—the
direction to adjust weights to make predictions better!
"""
"""

# %%
def demo_losses():
    """🎯 See how loss responds to prediction quality."""
    print("🎯 AHA MOMENT: Loss Guides Learning")
    print("=" * 45)

    loss_fn = MSELoss()
    target = Tensor(np.array([1.0, 0.0, 0.0]))

    # Perfect prediction
    perfect = Tensor(np.array([1.0, 0.0, 0.0]))
    loss_perfect = loss_fn(perfect, target)

    # Close prediction
    close = Tensor(np.array([0.9, 0.1, 0.1]))
    loss_close = loss_fn(close, target)

    # Wrong prediction
    wrong = Tensor(np.array([0.0, 1.0, 1.0]))
    loss_wrong = loss_fn(wrong, target)

    print(f"Perfect prediction → Loss: {float(loss_perfect.data):.4f}")
    print(f"Close prediction → Loss: {float(loss_close.data):.4f}")
    print(f"Wrong prediction → Loss: {float(loss_wrong.data):.4f}")

    print("\n✨ Lower loss = better predictions! Training minimizes this.")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_losses()

# %% [markdown]
"""
## 🎯 MODULE SUMMARY: Losses

@@ -2150,6 +2150,47 @@ After answering these questions, consider:
These questions prepare you for Module 06 (Optimizers), where you'll use these gradients to actually update parameters and train models!
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Gradients Flow Automatically

**What you built:** An autograd engine that computes gradients through computation graphs.

**Why it matters:** Before autograd, you had to derive and code gradients by hand for every
operation—error-prone and tedious. Your engine does this automatically! When you call
`backward()`, gradients flow from the loss back through every operation to every parameter.

This is the magic behind deep learning. PyTorch, TensorFlow, and JAX all have autograd
engines at their core. You just built one yourself!
"""
"""

# %%
def demo_autograd():
    """🎯 See gradients computed automatically."""
    print("🎯 AHA MOMENT: Gradients Flow Automatically")
    print("=" * 45)

    # Simple example: y = x^2, so dy/dx = 2x
    x = Tensor(np.array([3.0]), requires_grad=True)
    y = x * x  # y = x²

    print(f"x = {x.data[0]}")
    print(f"y = x² = {y.data[0]}")

    # Backward pass computes gradient
    y.backward()

    print(f"\ndy/dx = 2x = 2 × {x.data[0]} = {x.grad.data[0]}")
    print(f"Computed automatically: {x.grad.data[0]}")

    print("\n✨ Gradients computed automatically—no manual derivatives!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_autograd()

# %% [markdown]
"""
## 🎯 MODULE SUMMARY: Autograd Engine

@@ -1456,10 +1456,46 @@ def test_module():
    print("🎉 ALL TESTS PASSED! Module ready for export.")
    print("Run: tito module complete 06_optimizers")

# %% [markdown]
"""
## 🎯 Aha Moment: Optimizers Update Weights

**What you built:** Optimization algorithms (SGD, Adam) that update neural network weights.

**Why it matters:** Gradients tell us which direction reduces the loss, but someone has to
actually move the weights. That's what optimizers do! SGD takes simple steps, while Adam
adapts the learning rate for each parameter—like having a personal trainer for each weight.

In the next module, you'll combine optimizers with a training loop to actually train networks!
"""
"""

# %%
def demo_optimizers():
    """🎯 See optimizers update weights."""
    print("🎯 AHA MOMENT: Optimizers Update Weights")
    print("=" * 45)

    # Create a parameter with a gradient
    weight = Tensor(np.array([5.0]), requires_grad=True)
    weight.grad = np.array([1.0])  # Gradient pointing "uphill"

    print(f"Initial weight: {weight.data[0]:.2f}")
    print(f"Gradient: {weight.grad[0]:.2f} (pointing uphill)")

    # SGD takes a step in the opposite direction
    optimizer = SGD([weight], lr=0.5)
    optimizer.step()

    print(f"\nAfter SGD step: {weight.data[0]:.2f}")
    print(f"Moved: {5.0 - weight.data[0]:.2f} (opposite to gradient)")

    print("\n✨ Optimizer moves weights to reduce loss!")

# %%
# Run comprehensive module test
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_optimizers()

# %% [markdown]
"""

@@ -1268,10 +1268,60 @@ def test_module():
    print("🎉 ALL TESTS PASSED! Module ready for export.")
    print("Run: tito module complete 07")

# %% nbgrader={"grade": false, "grade_id": "main", "locked": false, "solution": false}
# Run comprehensive module test when executed directly
# %% [markdown]
"""
## 🎯 Aha Moment: Training Just Works

**What you built:** A complete training infrastructure with Trainer, schedulers, and checkpoints.

**Why it matters:** You've assembled all the pieces: tensors → layers → losses → autograd →
optimizers → training loop. This is the complete ML training pipeline! The Trainer orchestrates
forward pass, loss computation, backward pass, and weight updates—just like PyTorch Lightning.

In the milestones, you'll use this training infrastructure to train real models on real data!
"""

# %%
def demo_training():
    """🎯 See the training loop in action."""
    print("🎯 AHA MOMENT: Training Just Works")
    print("=" * 45)

    # Simple linear regression: learn y = 2x + 1
    np.random.seed(42)
    X = Tensor(np.random.randn(20, 1))
    y = Tensor(X.data * 2 + 1)  # True relationship

    # Simple model: one weight, one bias
    w = Tensor(np.array([[0.0]]), requires_grad=True)
    b = Tensor(np.array([0.0]), requires_grad=True)

    optimizer = SGD([w, b], lr=0.1)
    loss_fn = MSELoss()

    print("Learning y = 2x + 1:")
    for epoch in range(5):
        # Forward
        pred = X.matmul(w) + b
        loss = loss_fn(pred, y)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"  Epoch {epoch+1}: w={w.data[0,0]:.2f}, b={b.data[0]:.2f}, loss={float(loss.data):.4f}")

    print(f"\nLearned: y = {w.data[0,0]:.1f}x + {b.data[0]:.1f}")
    print("Target: y = 2.0x + 1.0")

    print("\n✨ Your training loop learned the pattern!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_training()

# %% [markdown]
"""

@@ -1618,13 +1618,48 @@ def test_module():
    print("🎉 ALL TESTS PASSED! Module ready for export.")
    print("Run: tito module complete 08")

# %% [markdown]
"""
## 🎯 Aha Moment: DataLoader Batches Your Data

**What you built:** A data pipeline that efficiently batches and shuffles training data.

**Why it matters:** Neural networks learn better from shuffled, batched data. Your DataLoader
handles all of this—grouping samples into batches for efficient vectorized operations, and
shuffling each epoch to prevent the model from memorizing the order.

In the milestones, you'll use this DataLoader to feed real images to your networks!
"""

# %%
def demo_dataloader():
    """🎯 See your DataLoader batch data correctly."""
    print("🎯 AHA MOMENT: DataLoader Batches Your Data")
    print("=" * 45)

    # Create a dataset
    X = Tensor(np.random.randn(100, 64))
    y = Tensor(np.arange(100))
    dataset = TensorDataset(X, y)

    # Create DataLoader with batching
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    print(f"Dataset: {len(dataset)} samples")
    print(f"Batch size: 32")
    print(f"Number of batches: {len(loader)}")

    print("\nBatches:")
    for i, (batch_x, batch_y) in enumerate(loader):
        print(f"  Batch {i+1}: {batch_x.shape[0]} samples, shape {batch_x.shape}")

    print("\n✨ Your DataLoader organizes data for efficient training!")

# %%
# Run comprehensive module test
if __name__ == "__main__":
    test_module()

    print("\n")
    demo_dataloader()

# %% [markdown]
"""

@@ -2252,6 +2252,46 @@ Why do mobile ML models prefer depthwise-separable convolutions over standard Co
**These questions help you think like an ML systems engineer, not just an algorithm implementer.**
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Convolution Extracts Features

**What you built:** Convolutional layers that process spatial data like images.

**Why it matters:** Conv2d looks at local neighborhoods, detecting edges, textures, and patterns.
Unlike Linear layers that see pixels independently, Conv2d understands that nearby pixels are
related. This is why CNNs revolutionized computer vision!

In the milestones, you'll use these spatial operations to build a CNN that recognizes digits.
"""

# %%
def demo_spatial():
    """🎯 See Conv2d process spatial data."""
    print("🎯 AHA MOMENT: Convolution Extracts Features")
    print("=" * 45)

    # Create a simple 8x8 "image" with 1 channel
    image = Tensor(np.random.randn(1, 1, 8, 8))

    # Conv2d: 1 input channel → 4 feature maps
    conv = Conv2d(in_channels=1, out_channels=4, kernel_size=3)

    output = conv(image)

    print(f"Input: {image.shape} ← 1 image, 1 channel, 8×8")
    print(f"Output: {output.shape} ← 1 image, 4 features, 6×6")
    print(f"\nConv kernel: 3×3 sliding window")
    print(f"Output smaller: 8 - 3 + 1 = 6 (no padding)")

    print("\n✨ Conv2d detects spatial patterns in images!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_spatial()

# %% [markdown]
"""
## 9. Module Summary

@@ -1547,11 +1547,46 @@ def test_module():
if __name__ == "__main__":
    test_module()

# %% [markdown]
"""
## 🎯 Aha Moment: Text Becomes Tokens

**What you built:** Tokenizers that convert text into numerical sequences.

**Why it matters:** Neural networks can't read text—they need numbers! Your tokenizer bridges
this gap, converting words into token IDs that can be embedded and processed. Every language
model from GPT to Claude uses tokenization as the first step.

In the next module, you'll convert these tokens into dense vector representations.
"""
"""

# %%
def demo_tokenization():
    """🎯 See text become tokens."""
    print("🎯 AHA MOMENT: Text Becomes Tokens")
    print("=" * 45)

    # Create and fit a character tokenizer
    tokenizer = CharTokenizer()
    text = "hello world"
    tokenizer.fit(text)

    # Encode and decode
    tokens = tokenizer.encode("hello")
    decoded = tokenizer.decode(tokens)

    print(f"Original: '{text}'")
    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"\nEncode 'hello': {tokens}")
    print(f"Decode back: '{decoded}'")

    print("\n✨ Text → numbers → text (perfect round-trip)!")

# %%
if __name__ == "__main__":
    print("🚀 Running Tokenization module...")
    test_module()
    print("✅ Module validation complete!")
    print("\n")
    demo_tokenization()

# %% [markdown]
"""

@@ -1358,12 +1358,46 @@ def test_module():
    print("\n🚀 Ready for: Attention mechanisms, transformers, and language models!")
    print("Export with: tito module complete 11")

# %% nbgrader={"grade": false, "grade_id": "main-execution", "solution": true}
# %% [markdown]
"""
## 🎯 Aha Moment: Tokens Become Vectors

**What you built:** An embedding layer that converts token IDs to dense vectors.

**Why it matters:** Tokens are just integers (like word IDs), but embeddings give them meaning!
Each token gets a learned vector that captures its semantic properties. Similar words end up
with similar vectors—this is how models understand language.

In the next module, you'll use attention to let these embeddings interact with each other.
"""
"""

# %%
def demo_embeddings():
    """🎯 See tokens become vectors."""
    print("🎯 AHA MOMENT: Tokens Become Vectors")
    print("=" * 45)

    # Create embedding layer: 100 vocab, 32-dimensional embeddings
    embed = Embedding(vocab_size=100, embed_dim=32)

    # Some token IDs
    tokens = Tensor(np.array([5, 10, 15]))

    # Look up embeddings
    vectors = embed(tokens)

    print(f"Token IDs: {tokens.data}")
    print(f"Embedding shape: {vectors.shape} ← 3 tokens, 32 dims each")
    print(f"\nToken 5 vector (first 5 dims): {vectors.data[0, :5].round(3)}")
    print(f"Token 10 vector (first 5 dims): {vectors.data[1, :5].round(3)}")

    print("\n✨ Each token has its own learned representation!")

# %%
if __name__ == "__main__":
    """Main execution block for module validation."""
    print("🚀 Running Embeddings module...")
    test_module()
    print("✅ Module validation complete!")
    print("\n")
    demo_embeddings()

# %% [markdown]
"""

@@ -1181,6 +1181,49 @@ Training requires storing activations for backward pass. How much extra memory d
- For GPT-3 scale (96 layers, 2048 context): _____ GB just for attention gradients
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Attention Finds Relationships

**What you built:** Attention mechanisms that let tokens interact with each other.

**Why it matters:** Before attention, models processed tokens independently. Attention lets
each token "look at" every other token and decide what's relevant. This is how transformers
understand that "it" refers to "the cat" in a sentence!

In the next module, you'll combine attention with MLPs to build full transformer blocks.
"""
"""

# %%
def demo_attention():
    """🎯 See attention compute relationships."""
    print("🎯 AHA MOMENT: Attention Finds Relationships")
    print("=" * 45)

    # Create Q, K, V for 4 tokens with 8-dim embeddings
    Q = Tensor(np.random.randn(1, 4, 8))
    K = Tensor(np.random.randn(1, 4, 8))
    V = Tensor(np.random.randn(1, 4, 8))

    # Compute attention
    output, weights = scaled_dot_product_attention(Q, K, V)

    print(f"Sequence length: 4 tokens")
    print(f"Embedding dim: 8")
    print(f"\nAttention weights shape: {weights.shape}")
    print(f"Each token attends to all 4 positions!")

    print(f"\nToken 0 attention: {weights.data[0, 0, :].round(2)}")
    print("(sums to 1.0 - it's a probability distribution)")

    print("\n✨ Attention lets tokens communicate!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_attention()

# %% [markdown]
"""
## 🎯 MODULE SUMMARY: Attention

@@ -1697,12 +1697,48 @@ def test_module():
# Call the comprehensive test
# test_module()  # Only run in __main__ block below

# %% [markdown]
"""
## 🎯 Aha Moment: Transformer Processes Sequences

**What you built:** A complete transformer block with attention, MLPs, and residual connections.

**Why it matters:** This is THE architecture behind GPT, Claude, LLaMA, and every modern
language model. The transformer block combines attention (for relationships) with MLPs
(for processing) and residual connections (for trainability).

In the milestones, you'll stack these blocks to build a working language model!
"""
"""

# %%
def demo_transformers():
    """🎯 See a transformer block process a sequence."""
    print("🎯 AHA MOMENT: Transformer Processes Sequences")
    print("=" * 45)

    # Create a transformer block
    block = TransformerBlock(embed_dim=64, num_heads=4, ff_dim=256)

    # Input: batch of 2 sequences, 8 tokens each, 64 dims
    x = Tensor(np.random.randn(2, 8, 64))

    output = block(x)

    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
    print(f"\nTransformerBlock contains:")
    print(f"  • Multi-head attention (4 heads)")
    print(f"  • MLP (64 → 256 → 64)")
    print(f"  • Residual connections")
    print(f"  • Layer normalization")

    print("\n✨ The building block of GPT, Claude, and more!")

# %%
if __name__ == "__main__":
    print("🚀 Running Transformers module...")
    demonstrate_transformer_integration()
    test_module()
    print("✅ Module validation complete!")
    print("\n")
    demo_transformers()

# %% [markdown]
"""

@@ -1739,11 +1739,49 @@ def test_module():
if __name__ == "__main__":
    test_module()

# %% [markdown]
"""
## 🎯 Aha Moment: Know Your Model

**What you built:** A profiler that measures parameters, FLOPs, memory, and latency.

**Why it matters:** You can't optimize what you can't measure! Before making a model faster
or smaller, you need to know where the time and memory go. Your profiler reveals these secrets,
telling you exactly what your model costs in compute and memory.

In the next modules, you'll use profiling to guide quantization and compression decisions.
"""

# %%
def demo_profiling():
    """🎯 See your profiler reveal model secrets."""
    print("🎯 AHA MOMENT: Know Your Model")
    print("=" * 45)

    # Create a simple model
    layer = Linear(784, 128)

    # Profile it
    profiler = Profiler()
    params = profiler.count_parameters(layer)
    flops = profiler.count_flops(layer, input_shape=(1, 784))

    print(f"Model: Linear(784 → 128)")
    print(f"\nParameters: {params:,}")
    print(f"  = 784 × 128 weights + 128 biases")

    print(f"\nFLOPs: {flops:,}")
    print(f"  = 784 × 128 × 2 (multiply-add per output)")

    print(f"\nMemory: {params * 4 / 1024:.1f} KB (at FP32)")

    print("\n✨ Profiling reveals optimization opportunities!")

# %%
if __name__ == "__main__":
    print("🚀 Running Profiling module...")
    test_module()
    print("✅ Module validation complete!")
    print("\n")
    demo_profiling()

# %% [markdown]
"""

@@ -1880,6 +1880,46 @@ In mobile/edge deployment scenarios:
### END SOLUTION
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Quantization Shrinks Models

**What you built:** Quantization that converts FP32 weights to INT8, reducing model size 4×.

**Why it matters:** A 400MB model becomes 100MB—small enough to run on a phone! Quantization
is how production ML deploys large models to edge devices. The 4× reduction comes from using
8 bits per weight instead of 32 bits.

In the MLPerf milestone, you'll see quantization in action, measuring real memory savings.
"""
"""

# %%
def demo_quantization():
    """🎯 See quantization shrink model size."""
    print("🎯 AHA MOMENT: Quantization Shrinks Models")
    print("=" * 45)

    # Create FP32 weights
    weights = Tensor(np.random.randn(256, 128).astype(np.float32))
    original_bytes = weights.data.nbytes

    # Quantize to INT8
    q_weights, scale, zero_point = Quantizer.quantize_tensor(weights)
    quantized_bytes = q_weights.data.size  # 1 byte per INT8 element

    print(f"Original FP32: {original_bytes:,} bytes")
    print(f"Quantized INT8: {quantized_bytes:,} bytes")
    print(f"\nCompression: {original_bytes / quantized_bytes:.0f}× smaller!")
    print(f"INT8 range: [{q_weights.data.min()}, {q_weights.data.max()}]")

    print("\n✨ Same values, 4× less memory!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_quantization()

# %% [markdown]
"""
## 🎯 MODULE SUMMARY: Quantization

@@ -1780,6 +1780,50 @@ For deploying on a mobile device with 50MB model limit and 100ms latency require
- What order should you apply compression techniques? _____________
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Pruning Removes Unimportant Weights

**What you built:** Pruning that zeros out small weights, creating sparse models.

**Why it matters:** Most neural network weights are close to zero—and removing them barely
affects accuracy! At 50% sparsity, half your weights are gone, but the model still works.
This is how you make models faster and smaller without retraining.

Combined with quantization, pruning can shrink models 8× or more.
"""
"""

# %%
def demo_compression():
    """🎯 See pruning create sparsity."""
    print("🎯 AHA MOMENT: Pruning Removes Weights")
    print("=" * 45)

    # Create a model
    layer = Linear(128, 64)

    original_nonzero = np.count_nonzero(layer.weight.data)
    original_total = layer.weight.data.size

    # Apply 50% pruning
    Compressor.magnitude_prune(layer, sparsity=0.5)

    pruned_nonzero = np.count_nonzero(layer.weight.data)
    sparsity = 1 - (pruned_nonzero / original_total)

    print(f"Original: {original_nonzero:,} non-zero weights")
    print(f"After 50% pruning: {pruned_nonzero:,} non-zero weights")
    print(f"\nActual sparsity: {sparsity:.1%}")
    print(f"Half the weights are now zero!")

    print("\n✨ Smaller weights removed—model still works!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_compression()

# %% [markdown]
"""
## 🎯 MODULE SUMMARY: Compression

@@ -1665,6 +1665,54 @@ ChatGPT serves millions of users. Each user's conversation needs its own KV cach
caches on disk and reload as needed (slower but cheaper)? What's the trade-off?
"""

# %% [markdown]
"""
## 🎯 Aha Moment: KV Cache Avoids Recomputation

**What you built:** A KV Cache that stores key-value pairs to avoid redundant attention computation.

**Why it matters:** When generating text token-by-token, naive attention recomputes the same
K,V values for all previous tokens at each step. With KV caching, you compute once and reuse!
This is why ChatGPT responds so fast—it's not recomputing everything every token.

This optimization turns each generation step from O(n²) attention work into O(n), enabling practical LLM deployment.
"""

# %%
def demo_memoization():
    """🎯 See KV cache store and reuse values."""
    print("🎯 AHA MOMENT: KV Cache Avoids Recomputation")
    print("=" * 45)

    # Create a cache for 2-layer transformer
    # (batch=1, max_seq=100, layers=2, heads=4, head_dim=64)
    cache = KVCache(batch_size=1, max_seq_len=100, num_layers=2,
                    num_heads=4, head_dim=64)

    # Simulate generating 5 tokens one at a time
    print("Generating tokens and caching K,V pairs...")
    for token_idx in range(5):
        # For each new token, compute K,V (shape: batch, heads, 1, head_dim)
        new_k = Tensor(np.random.randn(1, 4, 1, 64))
        new_v = Tensor(np.random.randn(1, 4, 1, 64))

        # Update cache for layer 0
        cache.update(0, new_k, new_v)
        cache.advance()  # Move to next position

    print(f"Cached K,V for {cache.seq_pos} tokens")

    # Retrieve all cached values
    k_all, v_all = cache.get(0)
    print(f"Retrieved: K{k_all.shape}, V{v_all.shape}")

    print("\n✨ Compute once, reuse forever—10× faster generation!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_memoization()

# %% [markdown]
"""

@@ -1420,6 +1420,56 @@ For edge deployment (memory critical, stability required, hardware diverse):
- What's the primary constraint: memory, compute, or power? _____
"""

# %% [markdown]
"""
## 🎯 Aha Moment: Vectorization and Fusion Speed Things Up

**What you built:** Vectorized operations and fused kernels that reduce memory traffic.

**Why it matters:** Individual operations like x + y + z require reading and writing memory
multiple times. Fused operations like fused_gelu do everything in one pass! This reduces
memory bandwidth by 60-80%, a huge win since memory is often the bottleneck.

Combined with vectorization (SIMD), these techniques make neural networks 2-5× faster.
"""
"""

# %%
def demo_acceleration():
    """🎯 See fused operations reduce memory traffic."""
    print("🎯 AHA MOMENT: Fusion Reduces Memory Traffic")
    print("=" * 45)

    # Create a tensor
    x = Tensor(np.random.randn(1000, 1000))

    import time

    # Unfused GELU (multiple operations)
    start = time.perf_counter()
    for _ in range(10):
        # Manual GELU: 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))
        t = x.data
        unfused = 0.5 * t * (1 + np.tanh(np.sqrt(2/np.pi) * (t + 0.044715 * t**3)))
    unfused_time = (time.perf_counter() - start) / 10

    # Fused GELU (single operation)
    start = time.perf_counter()
    for _ in range(10):
        fused = fused_gelu(x)
    fused_time = (time.perf_counter() - start) / 10

    print(f"Unfused GELU: {unfused_time*1000:.2f} ms")
    print(f"Fused GELU: {fused_time*1000:.2f} ms")
    print(f"\nSpeedup: {unfused_time/fused_time:.1f}×")

    print("\n✨ Same result, fewer memory accesses!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_acceleration()

# %% [markdown]
"""
## 🎯 MODULE SUMMARY: Acceleration

@@ -2289,8 +2289,55 @@ def test_module():
    print("🎉 ALL TESTS PASSED! Module ready for export.")
    print("Run: tito module complete 19")

# %% [markdown]
"""
## 🎯 Aha Moment: Measurement Enables Optimization

**What you built:** A benchmarking system with warmup, statistics, and reproducibility.

**Why it matters:** "Premature optimization is the root of all evil"—but you can't optimize
without measuring! Your benchmarking system produces reliable, comparable numbers: warmup
iterations eliminate cold-start effects, and multiple runs give confidence intervals.

This is how production ML teams make decisions: measure, compare, improve, repeat.
"""

# %%
def demo_benchmarking():
    """🎯 See professional benchmarking in action."""
    print("🎯 AHA MOMENT: Measurement Enables Optimization")
    print("=" * 45)

    # Create a simple model and input
    layer = Linear(512, 256)
    x = Tensor(np.random.randn(32, 512))

    # Benchmark with proper methodology
    benchmark = Benchmark(
        models=[layer],
        datasets=[(x, None)],
        warmup_iterations=3,
        measurement_iterations=10
    )

    results = benchmark.run()
    result = results[0]

    print(f"Model: Linear(512 → 256)")
    print(f"Batch: 32 samples")
    print(f"\nBenchmark Results (10 iterations):")
    print(f"  Mean latency: {result.mean*1000:.2f} ms")
    print(f"  Std dev: {result.std*1000:.2f} ms")
    print(f"  Min: {result.min*1000:.2f} ms")
    print(f"  Max: {result.max*1000:.2f} ms")

    print("\n✨ Reliable measurements guide optimization decisions!")

# %%
if __name__ == "__main__":
    test_module()
    print("\n")
    demo_benchmarking()

# %% [markdown]
"""

@@ -1531,10 +1531,49 @@ print("✅ Test module defined")
When run as a script, this demonstrates the complete workflow.
"""

# %% nbgrader={"grade": false, "grade_id": "main", "solution": true}
# %% [markdown]
"""
## 🎯 Aha Moment: You Built a Complete ML System

**What you built:** A professional benchmarking and submission system for your TinyTorch models.

**Why it matters:** You've gone from raw tensors to complete ML systems! Your capstone ties
together everything: models, training, optimization, profiling, and benchmarking. The
submission format you created is how real ML competitions and production deployments work.

Congratulations—you've built a deep learning framework from scratch!
"""

# %%
def demo_capstone():
    """🎯 See your complete system come together."""
    print("🎯 AHA MOMENT: You Built a Complete ML System")
    print("=" * 45)

    print("📚 Your TinyTorch Journey:")
    print()
    print("  Modules 01-08: Foundation")
    print("    Tensor → Activations → Layers → Losses")
    print("    → Autograd → Optimizers → Training → DataLoader")
    print()
    print("  Modules 09-13: Neural Architectures")
    print("    Conv2d → Tokenization → Embeddings")
    print("    → Attention → Transformers")
    print()
    print("  Modules 14-19: Production Optimization")
    print("    Profiling → Quantization → Compression")
    print("    → KV Caching → Acceleration → Benchmarking")
    print()
    print("  Module 20: Capstone")
    print("    Complete benchmarking and submission system")
    print()
    print("✨ From np.array to production ML—congratulations!")

# %%
if __name__ == "__main__":
    # Run the test module to validate everything works
    test_module()
    print("\n")
    demo_capstone()

# %% [markdown]
"""

@@ -129,7 +129,7 @@ def test_dense_with_tensor():
    assert layer.weight.shape == (10, 5), "Weight shape should match layer dims"
    # Bias may or may not exist depending on implementation
    if hasattr(layer, 'bias') and layer.bias is not None:
        assert isinstance(layer.bias, Tensor), "Bias should be Tensor"


def test_dense_with_activations():

@@ -112,16 +112,16 @@ def test_optimizer_with_mse_loss():
    optimizer = SGD(layer.parameters(), lr=0.01)
    loss_fn = MSELoss()

    # Forward pass
    x = Tensor(np.random.randn(4, 3), requires_grad=True)
    target = Tensor(np.random.randn(4, 1))
    output = layer(x)
    loss = loss_fn(output, target)

    # Backward and update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print("✅ Optimizer integrates with MSE loss!")

@@ -148,9 +148,9 @@ def test_optimizer_with_activations():
        "Sigmoid should output in [0, 1]"

    loss = output.sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print("✅ Optimizer works with activation functions!")

@@ -226,7 +226,7 @@ def test_unit_shape_manipulation():
    # Valid reshape
    reshaped = t.reshape(2, 3)
    assert reshaped.shape == (2, 3)

    # Invalid reshape should raise
    try:
        t.reshape(2, 2)  # 6 elements can't fit in 2×2=4