Release preparation: fix package exports, tests, and documentation

Package exports:
- Fix tinytorch/__init__.py to export all required components for milestones
- Add Dense as alias for Linear for compatibility
- Add loss functions (MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss)
- Export spatial operations, data loaders, and transformer components

Test infrastructure:
- Create tests/conftest.py to handle path setup
- Create tests/test_utils.py with shared test utilities
- Rename test_progressive_integration.py files to include module number
- Fix syntax errors in test files (spaces in class names)
- Remove stale test file referencing non-existent modules

Documentation:
- Update README.md with correct milestone file names
- Fix milestone requirements to match actual module dependencies

Export system:
- Run tito export --all to regenerate package from source modules
- Ensure all 20 modules are properly exported
Author: Vijay Janapa Reddi
Date: 2025-12-02 14:19:56 -05:00
Parent: 4b22d229d4
Commit: bd7fcb2177
42 changed files with 1526 additions and 393 deletions

View File

@@ -317,26 +317,28 @@ tito module complete 01
As you complete modules, unlock historical ML milestones demonstrating YOUR implementations:
-### 🧠 01. Perceptron (1957) - After Module 03
+### 🧠 01. Perceptron (1957) - After Module 07
```bash
cd milestones/01_1957_perceptron
-python perceptron_trained.py
+python 01_rosenblatt_forward.py  # Forward pass demo (after Module 03)
+python 02_rosenblatt_trained.py  # Training demo (after Module 07)
# Rosenblatt's first trainable neural network
# YOUR Linear layer + Sigmoid recreates history!
```
-**Requirements**: Modules 01-03 (Tensor, Activations, Layers)
+**Requirements**: Modules 01-07 (Tensor through Training)
**Achievement**: Binary classification with gradient descent
---
-### ⚡ 02. XOR Crisis (1969) - After Module 05
+### ⚡ 02. XOR Crisis (1969) - After Module 07
```bash
-cd milestones/02_1969_xor_crisis
-python xor_solved.py
+cd milestones/02_1969_xor
+python 01_xor_crisis.py  # Demonstrate the problem
+python 02_xor_solved.py  # Solve with hidden layers!
# Solve Minsky's XOR challenge with hidden layers
# YOUR autograd enables multi-layer learning!
```
-**Requirements**: Modules 01-05 (+ Autograd)
+**Requirements**: Modules 01-07 (Tensor through Training)
**Achievement**: Non-linear problem solving
---

View File

@@ -14,7 +14,7 @@ from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-class TestModule01Still Working:
+class TestModule01StillWorking:
"""Verify Module 01 (Setup) functionality is still intact."""
def test_setup_environment_stable(self):
@@ -88,7 +88,7 @@ class TestModule03ActivationsCore:
assert True, "Module 02: Sigmoid not implemented yet"
-class TestProgressive StackIntegration:
+class TestProgressiveStackIntegration:
"""Test that the full stack (01→02→03) works together."""
def test_tensor_activation_pipeline(self):

View File

@@ -1,334 +0,0 @@
"""
Integration Tests - Layers and Dense Networks
Tests cross-module interfaces and compatibility between individual Layers and Dense Network modules.
Focuses on integration, not re-testing individual module functionality.
"""
import pytest
import numpy as np
from test_utils import setup_integration_test
# Ensure proper setup before importing
setup_integration_test()
# Import ONLY from TinyTorch package
from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Dense
from tinytorch.core.dense import Sequential, create_mlp, MLP
from tinytorch.core.activations import ReLU, Sigmoid, Tanh
class TestLayersDenseNetworkInterface:
"""Test interface compatibility between individual Layers and Dense Networks."""
def test_dense_layer_to_sequential_network(self):
"""Test that Dense layers can be integrated into Sequential networks."""
# Create individual dense layers
layer1 = Dense(input_size=4, output_size=8)
layer2 = Dense(input_size=8, output_size=3)
# Test integration into Sequential
network = Sequential([layer1, ReLU(), layer2])
# Test interface compatibility
x = Tensor(np.random.randn(2, 4))
result = network(x)
# Verify integration works
assert isinstance(result, Tensor), "Sequential should work with Dense layers"
assert result.shape == (2, 3), "Sequential should process through all layers"
def test_dense_layer_compatibility_with_mlp(self):
"""Test that Dense layers are compatible with MLP construction."""
# Test that MLP uses same interface as individual Dense layers
individual_layer = Dense(input_size=6, output_size=10)
mlp_network = create_mlp(input_size=6, hidden_sizes=[10], output_size=3)
# Test same input works with both
x = Tensor(np.random.randn(1, 6))
# Individual layer output
layer_output = individual_layer(x)
# MLP output (should accept same input)
mlp_output = mlp_network(x)
# Verify interface compatibility
assert isinstance(layer_output, Tensor), "Dense layer should return Tensor"
assert isinstance(mlp_output, Tensor), "MLP should return Tensor"
assert layer_output.shape == (1, 10), "Dense layer should have expected output shape"
assert mlp_output.shape == (1, 3), "MLP should have expected output shape"
def test_layer_output_as_network_input(self):
"""Test that Dense layer output can be used as network input."""
# Create preprocessing layer
preprocessor = Dense(input_size=5, output_size=8)
# Create network that processes preprocessor output
network = Sequential([
Dense(input_size=8, output_size=12),
ReLU(),
Dense(input_size=12, output_size=4)
])
# Test pipeline: input → layer → network
x = Tensor(np.random.randn(3, 5))
preprocessed = preprocessor(x)
final_output = network(preprocessed)
# Verify pipeline interface
assert isinstance(preprocessed, Tensor), "Layer should produce Tensor for network"
assert isinstance(final_output, Tensor), "Network should accept layer output"
assert final_output.shape == (3, 4), "Pipeline should work end-to-end"
def test_network_layer_composition(self):
"""Test that networks can be composed with individual layers."""
# Create base network
base_network = create_mlp(input_size=4, hidden_sizes=[6], output_size=8)
# Add additional processing layer
final_layer = Dense(input_size=8, output_size=2)
# Test composition
x = Tensor(np.random.randn(2, 4))
# Pipeline: input → network → layer
network_output = base_network(x)
final_output = final_layer(network_output)
# Verify composition interface
assert isinstance(network_output, Tensor), "Network should produce Tensor for layer"
assert isinstance(final_output, Tensor), "Layer should accept network output"
assert network_output.shape == (2, 8), "Network output should have expected shape"
assert final_output.shape == (2, 2), "Layer should process network output correctly"
class TestLayerNetworkDataFlow:
"""Test data flow compatibility between layers and networks."""
def test_shape_preservation_across_layer_network_boundary(self):
"""Test shape preservation when crossing layer-network boundaries."""
shape_configs = [
(1, 4, 8, 2), # Single sample
(5, 6, 10, 3), # Small batch
(10, 8, 16, 4), # Larger batch
]
for batch_size, input_size, hidden_size, output_size in shape_configs:
# Create layer and network
layer = Dense(input_size=input_size, output_size=hidden_size)
network = Sequential([
Dense(input_size=hidden_size, output_size=hidden_size),
ReLU(),
Dense(input_size=hidden_size, output_size=output_size)
])
# Test data flow
x = Tensor(np.random.randn(batch_size, input_size))
layer_out = layer(x)
network_out = network(layer_out)
# Verify shape flow
            assert layer_out.shape == (batch_size, hidden_size), f"Layer should output correct shape for config {(batch_size, input_size, hidden_size, output_size)}"
            assert network_out.shape == (batch_size, output_size), f"Network should output correct shape for config {(batch_size, input_size, hidden_size, output_size)}"
def test_dtype_preservation_across_layer_network_boundary(self):
"""Test data type preservation across layer-network boundaries."""
# Test float32 flow
layer_f32 = Dense(input_size=4, output_size=6)
network_f32 = create_mlp(input_size=6, hidden_sizes=[8], output_size=2)
x_f32 = Tensor(np.random.randn(2, 4).astype(np.float32))
layer_out_f32 = layer_f32(x_f32)
network_out_f32 = network_f32(layer_out_f32)
# Verify dtype preservation
assert layer_out_f32.dtype == np.float32, "Layer should preserve float32"
assert network_out_f32.dtype == np.float32, "Network should preserve float32 from layer"
# Test float64 flow
layer_f64 = Dense(input_size=4, output_size=6)
network_f64 = create_mlp(input_size=6, hidden_sizes=[8], output_size=2)
x_f64 = Tensor(np.random.randn(2, 4).astype(np.float64))
layer_out_f64 = layer_f64(x_f64)
network_out_f64 = network_f64(layer_out_f64)
# Verify dtype preservation
assert layer_out_f64.dtype == np.float64, "Layer should preserve float64"
assert network_out_f64.dtype == np.float64, "Network should preserve float64 from layer"
def test_error_handling_at_layer_network_boundary(self):
"""Test error handling when layer-network interfaces are incompatible."""
# Create mismatched layer and network
layer = Dense(input_size=4, output_size=6)
mismatched_network = Sequential([Dense(input_size=8, output_size=2)]) # Expects 8, gets 6
x = Tensor(np.random.randn(1, 4))
layer_output = layer(x) # Shape (1, 6)
        # Should fail with a dimension mismatch; pytest.raises avoids the
        # assert-inside-try pitfall, where the AssertionError raised by the
        # test's own `assert False` would be swallowed by the except clause
        with pytest.raises((ValueError, AssertionError, TypeError)):
            mismatched_network(layer_output)  # Expects (1, 8), gets (1, 6)
class TestLayerNetworkSystemIntegration:
"""Test system-level integration scenarios with layers and networks."""
def test_multi_stage_processing_pipeline(self):
"""Test multi-stage processing using layers and networks."""
# Stage 1: Preprocessing layer
preprocessor = Dense(input_size=8, output_size=12)
# Stage 2: Feature extraction network
feature_extractor = Sequential([
Dense(input_size=12, output_size=16),
ReLU(),
Dense(input_size=16, output_size=10)
])
# Stage 3: Classification layer
classifier = Dense(input_size=10, output_size=3)
# Test complete pipeline
x = Tensor(np.random.randn(4, 8))
preprocessed = preprocessor(x)
features = feature_extractor(preprocessed)
predictions = classifier(features)
# Verify multi-stage integration
assert isinstance(preprocessed, Tensor), "Preprocessor should output Tensor"
assert isinstance(features, Tensor), "Feature extractor should output Tensor"
assert isinstance(predictions, Tensor), "Classifier should output Tensor"
assert predictions.shape == (4, 3), "Pipeline should produce expected final shape"
def test_parallel_layer_processing(self):
"""Test parallel processing with multiple layers feeding into network."""
# Create parallel processing layers
branch1 = Dense(input_size=6, output_size=4)
branch2 = Dense(input_size=6, output_size=4)
branch3 = Dense(input_size=6, output_size=4)
# Fusion network
fusion_network = Sequential([
Dense(input_size=12, output_size=8), # 4+4+4=12 from parallel branches
ReLU(),
Dense(input_size=8, output_size=2)
])
# Test parallel processing
x = Tensor(np.random.randn(2, 6))
# Process in parallel
out1 = branch1(x)
out2 = branch2(x)
out3 = branch3(x)
# Manually concatenate (simulating fusion)
# In a real implementation, this would be handled by a concatenation layer
fused_data = np.concatenate([out1.data, out2.data, out3.data], axis=1)
fused_tensor = Tensor(fused_data)
# Final processing
final_output = fusion_network(fused_tensor)
# Verify parallel processing integration
assert out1.shape == (2, 4), "Branch 1 should output correct shape"
assert out2.shape == (2, 4), "Branch 2 should output correct shape"
assert out3.shape == (2, 4), "Branch 3 should output correct shape"
assert fused_tensor.shape == (2, 12), "Fusion should combine all branches"
assert final_output.shape == (2, 2), "Final network should process fused input"
def test_layer_network_modularity(self):
"""Test that layers and networks can be replaced modularly."""
# Create modular components
input_processors = [
Dense(input_size=5, output_size=8),
Dense(input_size=5, output_size=8), # Different instance
]
core_networks = [
create_mlp(input_size=8, hidden_sizes=[10], output_size=6),
Sequential([Dense(input_size=8, output_size=6)]), # Different architecture
]
output_processors = [
Dense(input_size=6, output_size=3),
Dense(input_size=6, output_size=3), # Different instance
]
# Test all combinations work
x = Tensor(np.random.randn(1, 5))
for input_proc in input_processors:
for core_net in core_networks:
for output_proc in output_processors:
# Test modular pipeline
intermediate1 = input_proc(x)
intermediate2 = core_net(intermediate1)
final = output_proc(intermediate2)
# Verify modularity
assert isinstance(final, Tensor), "Modular combination should work"
assert final.shape == (1, 3), "Modular combination should produce expected output"
class TestLayerNetworkInterfaceStandards:
"""Test that layers and networks follow consistent interface standards."""
def test_consistent_call_interface(self):
"""Test that layers and networks have consistent callable interface."""
# Create different components
components = [
Dense(input_size=4, output_size=6),
Sequential([Dense(input_size=4, output_size=6)]),
create_mlp(input_size=4, hidden_sizes=[8], output_size=6),
MLP([4, 8, 6])
]
x = Tensor(np.random.randn(1, 4))
# Test all components have consistent interface
for component in components:
# Should be callable with same signature
result = component(x)
# Verify consistent interface
assert isinstance(result, Tensor), f"{type(component).__name__} should return Tensor"
assert result.shape[0] == 1, f"{type(component).__name__} should preserve batch dimension"
assert result.shape[1] == 6, f"{type(component).__name__} should produce expected output size"
def test_component_property_consistency(self):
"""Test that layers and networks have consistent properties."""
# Create components
layer = Dense(input_size=3, output_size=5)
network = Sequential([Dense(input_size=3, output_size=5)])
mlp = create_mlp(input_size=3, hidden_sizes=[], output_size=5)
# Test that all components can be used interchangeably
x = Tensor(np.random.randn(2, 3))
results = []
for component in [layer, network, mlp]:
result = component(x)
results.append(result)
# Verify consistent interface properties
assert hasattr(result, 'shape'), f"{type(component).__name__} result should have shape"
assert hasattr(result, 'data'), f"{type(component).__name__} result should have data"
assert hasattr(result, 'dtype'), f"{type(component).__name__} result should have dtype"
# All should produce same output shape
expected_shape = (2, 5)
for i, result in enumerate(results):
assert result.shape == expected_shape, f"Component {i} should produce consistent shape"
if __name__ == "__main__":
pytest.main([__file__])

View File: tests/conftest.py (new file, 29 lines)

@@ -0,0 +1,29 @@
"""
Pytest configuration for TinyTorch tests.
This file is automatically loaded by pytest and sets up the test environment.
"""
import sys
import os
from pathlib import Path
# Add tests directory to Python path so test_utils can be imported
tests_dir = Path(__file__).parent
if str(tests_dir) not in sys.path:
sys.path.insert(0, str(tests_dir))
# Add project root to Python path
project_root = tests_dir.parent
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
# Set quiet mode for tinytorch imports during tests
os.environ['TINYTORCH_QUIET'] = '1'
# Import test utilities to make them available
try:
from test_utils import setup_integration_test, create_test_tensor, assert_tensors_close
except ImportError:
pass # test_utils not yet created or has issues

View File: tests/test_utils.py (new file, 114 lines)

@@ -0,0 +1,114 @@
"""
TinyTorch Test Utilities
Shared utilities for integration tests across all modules.
Provides setup functions and common test helpers.
"""
import sys
import os
from pathlib import Path
def setup_integration_test():
"""
Set up the environment for integration testing.
This function ensures:
1. The TinyTorch package is importable
2. NumPy random seed is set for reproducibility
3. Warning filters are set appropriately
Call this at the top of integration test files before importing TinyTorch.
"""
import warnings
import numpy as np
# Ensure tinytorch is on the path (from project root)
project_root = Path(__file__).parent.parent
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
# Set random seed for reproducibility
np.random.seed(42)
# Suppress certain warnings during tests
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
# Set quiet mode for tinytorch imports during tests
os.environ['TINYTORCH_QUIET'] = '1'
def get_project_root() -> Path:
"""Return the project root directory."""
return Path(__file__).parent.parent
def get_test_data_path() -> Path:
"""Return the path to test data directory."""
return get_project_root() / "datasets"
def create_test_tensor(shape, requires_grad=True, seed=None):
"""
Create a test tensor with random data.
Args:
shape: Tuple specifying tensor shape
requires_grad: Whether tensor should track gradients
seed: Optional random seed for reproducibility
Returns:
Tensor with random data
"""
import numpy as np
from tinytorch.core.tensor import Tensor
if seed is not None:
np.random.seed(seed)
data = np.random.randn(*shape).astype(np.float32)
return Tensor(data, requires_grad=requires_grad)
def assert_tensors_close(t1, t2, rtol=1e-5, atol=1e-8, msg=""):
"""
Assert that two tensors are element-wise close.
Args:
t1: First tensor
t2: Second tensor
rtol: Relative tolerance
atol: Absolute tolerance
msg: Optional message for assertion error
"""
import numpy as np
# Extract data from tensors if needed
data1 = t1.data if hasattr(t1, 'data') else t1
data2 = t2.data if hasattr(t2, 'data') else t2
if not np.allclose(data1, data2, rtol=rtol, atol=atol):
diff = np.abs(data1 - data2)
max_diff = np.max(diff)
raise AssertionError(
f"Tensors not close (max diff: {max_diff:.6e}). {msg}"
)
def assert_gradients_exist(tensor, msg=""):
"""Assert that a tensor has computed gradients."""
if tensor.grad is None:
raise AssertionError(f"Tensor has no gradients. {msg}")
def skip_if_no_tinytorch():
"""Pytest skip decorator for when tinytorch isn't available."""
import pytest
try:
import tinytorch
return pytest.mark.skipif(False, reason="TinyTorch available")
except ImportError:
return pytest.mark.skip(reason="TinyTorch not installed")
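A sketch of how an integration test might use these helpers (assumes the tinytorch package and its Tensor type are importable, as the utilities above already require):

```python
from test_utils import setup_integration_test, create_test_tensor, assert_tensors_close

setup_integration_test()  # seeds NumPy, silences noisy warnings

def test_seeded_tensors_are_reproducible():
    a = create_test_tensor((2, 3), seed=0)
    b = create_test_tensor((2, 3), seed=0)  # same seed -> same values
    assert_tensors_close(a, b, msg="seeded tensors should be identical")
```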

View File: tinytorch/__init__.py (generated, 99 lines changed)

@@ -1,28 +1,97 @@
"""
TinyTorch - Build ML Systems From First Principles
A complete educational ML framework for learning neural network internals
by implementing everything from scratch.
Top-level exports provide easy access to commonly used components.
For advanced modules (optimization, profiling), import from submodules:
from tinytorch.profiling.profiler import Profiler
from tinytorch.optimization.quantization import quantize_int8
from tinytorch.generation.kv_cache import enable_kv_cache
"""
__version__ = "0.1.0"
# Import core functionality
from . import core
-# Make common components easily accessible at top level
+# ============================================================================
+# Core Functionality (Modules 01-07)
+# ============================================================================
from .core.tensor import Tensor
-from .core.layers import Linear, Dropout
 from .core.activations import Sigmoid, ReLU, Tanh, GELU, Softmax
-# from .core.losses import MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss # TEMP: removed for testing
-from .core.optimizers import SGD, AdamW
+from .core.layers import Layer, Linear, Dense, Dropout
+from .core.losses import MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss
+from .core.optimizers import SGD, Adam, AdamW
+from .core.training import Trainer, CosineSchedule, clip_grad_norm
-# 🔥 CRITICAL: Enable automatic differentiation
-# This patches Tensor operations to track gradients
-# Use quiet=True when imported by CLI tools to avoid cluttering output
+# ============================================================================
+# Data Loading (Module 08)
+# ============================================================================
from .data.loader import Dataset, TensorDataset, DataLoader
# ============================================================================
# Spatial Operations (Module 09)
# ============================================================================
from .core.spatial import Conv2d, MaxPool2d
# ============================================================================
# Text Processing (Modules 10-11)
# ============================================================================
from .text.tokenization import Tokenizer, CharTokenizer, BPETokenizer
from .text.embeddings import Embedding, PositionalEncoding, EmbeddingLayer
# ============================================================================
# Attention & Transformers (Modules 12-13)
# ============================================================================
from .core.attention import MultiHeadAttention, scaled_dot_product_attention
from .models.transformer import LayerNorm, MLP, TransformerBlock, GPT
# ============================================================================
# Enable Autograd (CRITICAL - must happen after imports)
# ============================================================================
import os
from .core.autograd import enable_autograd
# Enable autograd quietly when imported by CLI tools
enable_autograd(quiet=os.environ.get('TINYTORCH_QUIET', '').lower() in ('1', 'true', 'yes'))
-# Export main public API
+# ============================================================================
+# Public API
+# ============================================================================
__all__ = [
-    'core',
+    # Version
+    '__version__',
+    # Core - Tensor
     'Tensor',
-    'Linear', 'Dropout',
+    # Core - Activations
     'Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax',
-    # 'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss', # TEMP: removed for testing
-    'SGD', 'AdamW'
+    # Core - Layers
+    'Layer', 'Linear', 'Dense', 'Dropout',
+    # Core - Losses
+    'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss',
+    # Core - Optimizers
+    'SGD', 'Adam', 'AdamW',
+    # Core - Training
+    'Trainer', 'CosineSchedule', 'clip_grad_norm',
+    # Data Loading
+    'Dataset', 'TensorDataset', 'DataLoader',
+    # Core - Spatial (CNN)
+    'Conv2d', 'MaxPool2d',
+    # Text/NLP
+    'Tokenizer', 'CharTokenizer', 'BPETokenizer',
+    'Embedding', 'PositionalEncoding', 'EmbeddingLayer',
+    # Core - Attention
+    'MultiHeadAttention', 'scaled_dot_product_attention',
+    # Models
+    'LayerNorm', 'MLP', 'TransformerBlock', 'GPT',
 ]
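With these exports in place, milestone scripts can rely on plain top-level imports; a minimal sketch (shapes illustrative, `Dense` constructor signature as used in the tests elsewhere in this commit):

```python
import numpy as np
from tinytorch import Tensor, Dense, Sigmoid

layer = Dense(input_size=2, output_size=1)  # Dense is the Linear alias
x = Tensor(np.random.randn(4, 2))
probs = Sigmoid()(layer(x))                 # forward pass through YOUR layers
print(probs.shape)                          # (4, 1)
```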

View File: tinytorch/_modidx.py (generated, 98 lines changed)

@@ -63,6 +63,14 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.Benchmark.run_memory_benchmark': ( '19_benchmarking/benchmarking.html#benchmark.run_memory_benchmark',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.BenchmarkResult': ( '19_benchmarking/benchmarking.html#benchmarkresult',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.BenchmarkResult.__post_init__': ( '19_benchmarking/benchmarking.html#benchmarkresult.__post_init__',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.BenchmarkResult.__str__': ( '19_benchmarking/benchmarking.html#benchmarkresult.__str__',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.BenchmarkResult.to_dict': ( '19_benchmarking/benchmarking.html#benchmarkresult.to_dict',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.BenchmarkSuite': ( '19_benchmarking/benchmarking.html#benchmarksuite',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.BenchmarkSuite.__init__': ( '19_benchmarking/benchmarking.html#benchmarksuite.__init__',
@@ -89,10 +97,33 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.test_unit_benchmark': ( '19_benchmarking/benchmarking.html#test_unit_benchmark',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.test_unit_benchmark_result': ( '19_benchmarking/benchmarking.html#test_unit_benchmark_result',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.test_unit_benchmark_suite': ( '19_benchmarking/benchmarking.html#test_unit_benchmark_suite',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.test_unit_tinymlperf': ( '19_benchmarking/benchmarking.html#test_unit_tinymlperf',
'tinytorch/benchmarking/benchmark.py')},
'tinytorch.capstone': { 'tinytorch.capstone.BenchmarkReport': ( '20_capstone/capstone.html#benchmarkreport',
'tinytorch/capstone.py'),
'tinytorch.capstone.BenchmarkReport.__init__': ( '20_capstone/capstone.html#benchmarkreport.__init__',
'tinytorch/capstone.py'),
'tinytorch.capstone.BenchmarkReport._get_system_info': ( '20_capstone/capstone.html#benchmarkreport._get_system_info',
'tinytorch/capstone.py'),
'tinytorch.capstone.BenchmarkReport.benchmark_model': ( '20_capstone/capstone.html#benchmarkreport.benchmark_model',
'tinytorch/capstone.py'),
'tinytorch.capstone.SimpleMLP': ('20_capstone/capstone.html#simplemlp', 'tinytorch/capstone.py'),
'tinytorch.capstone.SimpleMLP.__init__': ( '20_capstone/capstone.html#simplemlp.__init__',
'tinytorch/capstone.py'),
'tinytorch.capstone.SimpleMLP.count_parameters': ( '20_capstone/capstone.html#simplemlp.count_parameters',
'tinytorch/capstone.py'),
'tinytorch.capstone.SimpleMLP.forward': ( '20_capstone/capstone.html#simplemlp.forward',
'tinytorch/capstone.py'),
'tinytorch.capstone.SimpleMLP.parameters': ( '20_capstone/capstone.html#simplemlp.parameters',
'tinytorch/capstone.py'),
'tinytorch.capstone.generate_submission': ( '20_capstone/capstone.html#generate_submission',
'tinytorch/capstone.py'),
'tinytorch.capstone.save_submission': ( '20_capstone/capstone.html#save_submission',
'tinytorch/capstone.py')},
'tinytorch.competition.submit': { 'tinytorch.competition.submit.generate_baseline': ( 'source/20_competition/competition_dev.html#generate_baseline',
'tinytorch/competition/submit.py'),
'tinytorch.competition.submit.generate_submission': ( 'source/20_competition/competition_dev.html#generate_submission',
@@ -115,6 +146,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.GELU.forward': ( '02_activations/activations.html#gelu.forward',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.GELU.parameters': ( '02_activations/activations.html#gelu.parameters',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.ReLU': ( '02_activations/activations.html#relu',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.ReLU.__call__': ( '02_activations/activations.html#relu.__call__',
@@ -123,6 +156,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.ReLU.forward': ( '02_activations/activations.html#relu.forward',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.ReLU.parameters': ( '02_activations/activations.html#relu.parameters',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Sigmoid': ( '02_activations/activations.html#sigmoid',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Sigmoid.__call__': ( '02_activations/activations.html#sigmoid.__call__',
@@ -131,6 +166,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Sigmoid.forward': ( '02_activations/activations.html#sigmoid.forward',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Sigmoid.parameters': ( '02_activations/activations.html#sigmoid.parameters',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Softmax': ( '02_activations/activations.html#softmax',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Softmax.__call__': ( '02_activations/activations.html#softmax.__call__',
@@ -139,6 +176,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Softmax.forward': ( '02_activations/activations.html#softmax.forward',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Softmax.parameters': ( '02_activations/activations.html#softmax.parameters',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Tanh': ( '02_activations/activations.html#tanh',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Tanh.__call__': ( '02_activations/activations.html#tanh.__call__',
@@ -146,7 +185,9 @@ d = { 'settings': { 'branch': 'main',
'tinytorch.core.activations.Tanh.backward': ( '02_activations/activations.html#tanh.backward',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Tanh.forward': ( '02_activations/activations.html#tanh.forward',
'tinytorch/core/activations.py')},
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Tanh.parameters': ( '02_activations/activations.html#tanh.parameters',
'tinytorch/core/activations.py')},
'tinytorch.core.attention': { 'tinytorch.core.attention.MultiHeadAttention': ( '12_attention/attention.html#multiheadattention',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.MultiHeadAttention.__call__': ( '12_attention/attention.html#multiheadattention.__call__',
@@ -264,6 +305,20 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.AvgPool2d.parameters': ( '09_spatial/spatial.html#avgpool2d.parameters',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.BatchNorm2d': ( '09_spatial/spatial.html#batchnorm2d',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.BatchNorm2d.__call__': ( '09_spatial/spatial.html#batchnorm2d.__call__',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.BatchNorm2d.__init__': ( '09_spatial/spatial.html#batchnorm2d.__init__',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.BatchNorm2d.eval': ( '09_spatial/spatial.html#batchnorm2d.eval',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.BatchNorm2d.forward': ( '09_spatial/spatial.html#batchnorm2d.forward',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.BatchNorm2d.parameters': ( '09_spatial/spatial.html#batchnorm2d.parameters',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.BatchNorm2d.train': ( '09_spatial/spatial.html#batchnorm2d.train',
'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.Conv2d': ('09_spatial/spatial.html#conv2d', 'tinytorch/core/spatial.py'),
'tinytorch.core.spatial.Conv2d.__call__': ( '09_spatial/spatial.html#conv2d.__call__',
'tinytorch/core/spatial.py'),
@@ -367,8 +422,16 @@ d = { 'settings': { 'branch': 'main',
'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training.html#trainer.save_checkpoint',
'tinytorch/core/training.py'),
'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training.html#trainer.train_epoch',
'tinytorch/core/training.py')},
'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader.html#dataloader',
'tinytorch/core/training.py'),
'tinytorch.core.training.clip_grad_norm': ( '07_training/training.html#clip_grad_norm',
'tinytorch/core/training.py')},
'tinytorch.data.loader': { 'tinytorch.data.loader.Compose': ( '08_dataloader/dataloader.html#compose',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.Compose.__call__': ( '08_dataloader/dataloader.html#compose.__call__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.Compose.__init__': ( '08_dataloader/dataloader.html#compose.__init__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader.html#dataloader',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader.html#dataloader.__init__',
'tinytorch/data/loader.py'),
@@ -384,6 +447,18 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.Dataset.__len__': ( '08_dataloader/dataloader.html#dataset.__len__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.RandomCrop': ( '08_dataloader/dataloader.html#randomcrop',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.RandomCrop.__call__': ( '08_dataloader/dataloader.html#randomcrop.__call__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.RandomCrop.__init__': ( '08_dataloader/dataloader.html#randomcrop.__init__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.RandomHorizontalFlip': ( '08_dataloader/dataloader.html#randomhorizontalflip',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.RandomHorizontalFlip.__call__': ( '08_dataloader/dataloader.html#randomhorizontalflip.__call__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.RandomHorizontalFlip.__init__': ( '08_dataloader/dataloader.html#randomhorizontalflip.__init__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.TensorDataset': ( '08_dataloader/dataloader.html#tensordataset',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.TensorDataset.__getitem__': ( '08_dataloader/dataloader.html#tensordataset.__getitem__',
@@ -406,6 +481,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/generation/kv_cache.py'),
'tinytorch.generation.kv_cache.KVCache.update': ( '17_memoization/memoization.html#kvcache.update',
'tinytorch/generation/kv_cache.py'),
'tinytorch.generation.kv_cache.create_kv_cache': ( '17_memoization/memoization.html#create_kv_cache',
'tinytorch/generation/kv_cache.py'),
'tinytorch.generation.kv_cache.disable_kv_cache': ( '17_memoization/memoization.html#disable_kv_cache',
'tinytorch/generation/kv_cache.py'),
'tinytorch.generation.kv_cache.enable_kv_cache': ( '17_memoization/memoization.html#enable_kv_cache',
@@ -454,7 +531,12 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers.html#transformerblock.parameters',
'tinytorch/models/transformer.py')},
'tinytorch.optimization.acceleration': {},
'tinytorch.optimization.acceleration': { 'tinytorch.optimization.acceleration.fused_gelu': ( '18_acceleration/acceleration.html#fused_gelu',
'tinytorch/optimization/acceleration.py'),
'tinytorch.optimization.acceleration.tiled_matmul': ( '18_acceleration/acceleration.html#tiled_matmul',
'tinytorch/optimization/acceleration.py'),
'tinytorch.optimization.acceleration.vectorized_matmul': ( '18_acceleration/acceleration.html#vectorized_matmul',
'tinytorch/optimization/acceleration.py')},
'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.CompressionComplete': ( '16_compression/compression.html#compressioncomplete',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.compress_model': ( '16_compression/compression.html#compressioncomplete.compress_model',
@@ -479,6 +561,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.compress_model': ( '16_compression/compression.html#compress_model',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.low_rank_approximate': ( '16_compression/compression.html#low_rank_approximate',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.magnitude_prune': ( '16_compression/compression.html#magnitude_prune',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.measure_sparsity': ( '16_compression/compression.html#measure_sparsity',
@@ -515,6 +599,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.SimpleModel.forward': ( '15_quantization/quantization.html#simplemodel.forward',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.compare_model_sizes': ( '15_quantization/quantization.html#compare_model_sizes',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.dequantize_int8': ( '15_quantization/quantization.html#dequantize_int8',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.quantize_int8': ( '15_quantization/quantization.html#quantize_int8',
@@ -578,7 +664,9 @@ d = { 'settings': { 'branch': 'main',
'tinytorch.text.embeddings.PositionalEncoding.forward': ( '11_embeddings/embeddings.html#positionalencoding.forward',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.parameters': ( '11_embeddings/embeddings.html#positionalencoding.parameters',
'tinytorch/text/embeddings.py')},
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.create_sinusoidal_embeddings': ( '11_embeddings/embeddings.html#create_sinusoidal_embeddings',
'tinytorch/text/embeddings.py')},
'tinytorch.text.tokenization': { 'tinytorch.text.tokenization.BPETokenizer': ( '10_tokenization/tokenization.html#bpetokenizer',
'tinytorch/text/tokenization.py'),
'tinytorch.text.tokenization.BPETokenizer.__init__': ( '10_tokenization/tokenization.html#bpetokenizer.__init__',

View File

@@ -15,14 +15,116 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['DEFAULT_WARMUP_RUNS', 'DEFAULT_MEASUREMENT_RUNS', 'Benchmark', 'test_unit_benchmark', 'BenchmarkSuite',
-           'test_unit_benchmark_suite', 'TinyMLPerf', 'test_unit_tinymlperf']
+__all__ = ['DEFAULT_WARMUP_RUNS', 'DEFAULT_MEASUREMENT_RUNS', 'BenchmarkResult', 'test_unit_benchmark_result', 'Benchmark',
+           'test_unit_benchmark', 'BenchmarkSuite', 'test_unit_benchmark_suite', 'TinyMLPerf', 'test_unit_tinymlperf']
# %% ../../modules/19_benchmarking/19_benchmarking.ipynb 0
# Constants for benchmarking defaults
DEFAULT_WARMUP_RUNS = 5 # Default warmup runs for JIT compilation and cache warming
DEFAULT_MEASUREMENT_RUNS = 10 # Default measurement runs for statistical significance
# %% ../../modules/19_benchmarking/19_benchmarking.ipynb 9
@dataclass
class BenchmarkResult:
"""
Container for benchmark measurements with statistical analysis.
TODO: Implement a robust result container that stores measurements and metadata
APPROACH:
1. Store raw measurements and computed statistics
2. Include metadata about test conditions
3. Provide methods for statistical analysis
4. Support serialization for result persistence
EXAMPLE:
>>> result = BenchmarkResult("model_accuracy", [0.95, 0.94, 0.96])
>>> print(f"Mean: {result.mean:.3f} ± {result.std:.3f}")
Mean: 0.950 ± 0.010
HINTS:
- Use statistics module for robust mean/std calculations
- Store both raw data and summary statistics
- Include confidence intervals for professional reporting
"""
### BEGIN SOLUTION
metric_name: str
values: List[float]
metadata: Dict[str, Any] = field(default_factory=dict)
def __post_init__(self):
"""Compute statistics after initialization."""
if not self.values:
raise ValueError(
"BenchmarkResult requires at least one measurement.\n"
" Issue: Cannot compute statistics without any measurements.\n"
" Fix: Ensure benchmark runs produce at least one measurement before creating BenchmarkResult."
)
self.mean = statistics.mean(self.values)
self.std = statistics.stdev(self.values) if len(self.values) > 1 else 0.0
self.median = statistics.median(self.values)
self.min_val = min(self.values)
self.max_val = max(self.values)
self.count = len(self.values)
# 95% confidence interval for the mean
if len(self.values) > 1:
            z_score = 1.96  # Normal-approximation critical value for a 95% CI
            margin_error = z_score * (self.std / np.sqrt(self.count))
self.ci_lower = self.mean - margin_error
self.ci_upper = self.mean + margin_error
else:
self.ci_lower = self.ci_upper = self.mean
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
'metric_name': self.metric_name,
'values': self.values,
'mean': self.mean,
'std': self.std,
'median': self.median,
'min': self.min_val,
'max': self.max_val,
'count': self.count,
'ci_lower': self.ci_lower,
'ci_upper': self.ci_upper,
'metadata': self.metadata
}
def __str__(self) -> str:
return f"{self.metric_name}: {self.mean:.4f} ± {self.std:.4f} (n={self.count})"
### END SOLUTION
def test_unit_benchmark_result():
"""🔬 Test BenchmarkResult statistical calculations."""
print("🔬 Unit Test: BenchmarkResult...")
# Test basic statistics
values = [1.0, 2.0, 3.0, 4.0, 5.0]
result = BenchmarkResult("test_metric", values)
assert result.mean == 3.0
assert abs(result.std - statistics.stdev(values)) < 1e-10
assert result.median == 3.0
assert result.min_val == 1.0
assert result.max_val == 5.0
assert result.count == 5
# Test confidence intervals
assert result.ci_lower < result.mean < result.ci_upper
# Test serialization
result_dict = result.to_dict()
assert result_dict['metric_name'] == "test_metric"
assert result_dict['mean'] == 3.0
print("✅ BenchmarkResult works correctly!")
if __name__ == "__main__":
test_unit_benchmark_result()
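A usage sketch for the new result container (import path follows the module index above; the measurement values are hypothetical):

```python
from tinytorch.benchmarking.benchmark import BenchmarkResult

latencies_ms = [12.1, 11.8, 12.4, 12.0, 11.9]   # hypothetical measurements
result = BenchmarkResult("inference_latency_ms", latencies_ms)
print(result)  # inference_latency_ms: 12.0400 ± 0.2302 (n=5)
print(result.to_dict()["ci_lower"], result.to_dict()["ci_upper"])
```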
# %% ../../modules/19_benchmarking/19_benchmarking.ipynb 13
class Benchmark:
"""

View File

@@ -293,11 +293,10 @@ class MultiHeadAttention:
mask_reshaped = mask
if mask is not None and len(mask.shape) == 3:
# Add head dimension: (batch, seq, seq) -> (batch, 1, seq, seq)
-            # Note: Tensor.reshape doesn't support adding dims easily without full shape
-            # But we can use numpy reshape on data and wrap in Tensor?
-            # Or just rely on broadcasting if mask is 2D?
-            # In the proof script, mask is None, so this is fine.
-            pass
+            # This allows the mask to broadcast across all attention heads
+            batch_size_mask, seq_len_mask, _ = mask.shape
+            mask_data = mask.data.reshape(batch_size_mask, 1, seq_len_mask, seq_len_mask)
+            mask_reshaped = Tensor(mask_data, requires_grad=False)
attended, _ = scaled_dot_product_attention(Q, K, V, mask=mask_reshaped)
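The broadcasting trick in isolation, as a pure-NumPy sketch: inserting a singleton head axis lets one (batch, seq, seq) mask cover every attention head.

```python
import numpy as np

batch, heads, seq = 2, 4, 5
scores = np.random.randn(batch, heads, seq, seq)   # per-head attention logits
mask = np.tril(np.ones((batch, seq, seq)))         # causal mask, one per batch item
mask4d = mask.reshape(batch, 1, seq, seq)          # (batch, 1, seq, seq)
masked = np.where(mask4d > 0, scores, -1e9)        # broadcasts across the head axis
assert masked.shape == (batch, heads, seq, seq)
```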

View File

@@ -446,6 +446,7 @@ class EmbeddingBackward(Function):
return (grad_weight,)
#| export
class SliceBackward(Function):
"""
@@ -1298,6 +1299,6 @@ def enable_autograd(quiet=False):
print(" - requires_grad=True enables tracking")
# Auto-enable when module is imported
-# Check TINYTORCH_QUIET env var to suppress messages (for CLI tools)
-import os
-enable_autograd(quiet=os.environ.get('TINYTORCH_QUIET', '').lower() in ('1', 'true', 'yes'))
+# Always quiet to avoid cluttering user imports
+enable_autograd(quiet=True)

View File

@@ -15,7 +15,7 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['XAVIER_SCALE_FACTOR', 'HE_SCALE_FACTOR', 'DROPOUT_MIN_PROB', 'DROPOUT_MAX_PROB', 'Layer', 'Linear', 'Dropout']
+__all__ = ['XAVIER_SCALE_FACTOR', 'HE_SCALE_FACTOR', 'DROPOUT_MIN_PROB', 'DROPOUT_MAX_PROB', 'Layer', 'Linear', 'Dense', 'Dropout']
# %% ../../modules/03_layers/03_layers.ipynb 1
import numpy as np
@@ -273,3 +273,7 @@ class Dropout(Layer):
def __repr__(self):
return f"Dropout(p={self.p})"
# Alias for compatibility - Dense is the same as Linear
# Some frameworks use Dense, some use Linear - they're identical
Dense = Linear
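Since the alias is a plain assignment, both names refer to the same class; a quick check (constructor signature as used in the tests in this commit):

```python
from tinytorch.core.layers import Dense, Linear

assert Dense is Linear                       # same class object, two names
layer = Dense(input_size=8, output_size=4)   # identical to Linear(...)
```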

View File

@@ -240,9 +240,14 @@ class SGD(Optimizer):
if param.grad is None:
continue
-            # Get gradient data (grad is a Tensor from Module 01)
+            # Get gradient data - grad can be Tensor or numpy array
             grad = param.grad
-            grad_data = grad.data
+            # Handle both Tensor (with .data) and numpy array (from autograd) cases
+            if isinstance(grad, Tensor):
+                grad_data = grad.data
+            else:
+                # grad is already a numpy array from autograd
+                grad_data = grad
# Apply weight decay
if self.weight_decay != 0:
@@ -342,9 +347,14 @@ class Adam(Optimizer):
if param.grad is None:
continue
-            # Get gradient data (grad is a Tensor from Module 01)
+            # Get gradient data - grad can be Tensor or numpy array
             grad = param.grad
-            grad_data = grad.data
+            # Handle both Tensor (with .data) and numpy array (from autograd) cases
+            if isinstance(grad, Tensor):
+                grad_data = grad.data
+            else:
+                # grad is already a numpy array from autograd
+                grad_data = grad
# Apply weight decay
if self.weight_decay != 0:
@@ -446,9 +456,14 @@ class AdamW(Optimizer):
if param.grad is None:
continue
-            # Get gradient data (NOT modified by weight decay)
+            # Get gradient data - grad can be Tensor or numpy array
             grad = param.grad
-            grad_data = grad.data
+            # Handle both Tensor (with .data) and numpy array (from autograd) cases
+            if isinstance(grad, Tensor):
+                grad_data = grad.data
+            else:
+                # grad is already a numpy array from autograd
+                grad_data = grad
# Initialize buffers if needed
if self.m_buffers[i] is None:
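The same Tensor-or-ndarray branch now appears in SGD, Adam, and AdamW; a hypothetical helper (not part of this commit, assumes `Tensor` is imported as in optimizers.py) would express the normalization once:

```python
def _grad_as_array(grad):
    """Return the raw ndarray behind a gradient, whether it is a Tensor or an ndarray."""
    return grad.data if isinstance(grad, Tensor) else grad
```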

View File

@@ -16,7 +16,7 @@
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['DEFAULT_KERNEL_SIZE', 'DEFAULT_STRIDE', 'DEFAULT_PADDING', 'Conv2dBackward', 'Conv2d', 'MaxPool2dBackward',
-           'MaxPool2d', 'AvgPool2d', 'SimpleCNN']
+           'MaxPool2d', 'AvgPool2d', 'BatchNorm2d', 'SimpleCNN']
# %% ../../modules/09_spatial/09_spatial.ipynb 1
import numpy as np
@@ -133,6 +133,7 @@ class Conv2dBackward(Function):
# Following TinyTorch protocol: return (grad_input, grad_weight, grad_bias)
return grad_input, grad_weight, grad_bias
#| export
class Conv2d:
"""
@@ -392,6 +393,7 @@ class MaxPool2dBackward(Function):
# Return as tuple (following Function protocol)
return (grad_input,)
#| export
class MaxPool2d:
"""
@@ -662,7 +664,160 @@ class AvgPool2d:
"""Enable model(x) syntax."""
return self.forward(x)
-# %% ../../modules/09_spatial/09_spatial.ipynb 21
+# %% ../../modules/09_spatial/09_spatial.ipynb 15
class BatchNorm2d:
"""
Batch Normalization for 2D spatial inputs (images).
Normalizes activations across batch and spatial dimensions for each channel,
then applies learnable scale (gamma) and shift (beta) parameters.
Key behaviors:
- Training: Uses batch statistics, updates running statistics
- Eval: Uses frozen running statistics for consistent inference
Args:
num_features: Number of channels (C in NCHW format)
eps: Small constant for numerical stability (default: 1e-5)
momentum: Momentum for running statistics update (default: 0.1)
"""
def __init__(self, num_features, eps=1e-5, momentum=0.1):
"""
Initialize BatchNorm2d layer.
TODO: Initialize learnable and running parameters
APPROACH:
1. Store hyperparameters (num_features, eps, momentum)
2. Initialize gamma (scale) to ones - identity at start
3. Initialize beta (shift) to zeros - no shift at start
4. Initialize running_mean to zeros
5. Initialize running_var to ones
6. Set training mode to True initially
EXAMPLE:
>>> bn = BatchNorm2d(64) # For 64-channel feature maps
>>> print(bn.gamma.shape) # (64,)
>>> print(bn.training) # True
"""
super().__init__()
### BEGIN SOLUTION
self.num_features = num_features
self.eps = eps
self.momentum = momentum
# Learnable parameters (requires_grad=True for training)
# gamma (scale): initialized to 1 so output = normalized input initially
self.gamma = Tensor(np.ones(num_features), requires_grad=True)
# beta (shift): initialized to 0 so no shift initially
self.beta = Tensor(np.zeros(num_features), requires_grad=True)
# Running statistics (not trained, accumulated during training)
# These are used during evaluation for consistent normalization
self.running_mean = np.zeros(num_features)
self.running_var = np.ones(num_features)
# Training mode flag
self.training = True
### END SOLUTION
def train(self):
"""Set layer to training mode."""
self.training = True
return self
def eval(self):
"""Set layer to evaluation mode."""
self.training = False
return self
def forward(self, x):
"""
Forward pass through BatchNorm2d.
TODO: Implement batch normalization forward pass
APPROACH:
1. Validate input shape (must be 4D: batch, channels, height, width)
2. If training:
a. Compute batch mean and variance per channel
b. Normalize using batch statistics
c. Update running statistics with momentum
3. If eval:
a. Use running mean and variance
b. Normalize using frozen statistics
4. Apply scale (gamma) and shift (beta)
EXAMPLE:
>>> bn = BatchNorm2d(16)
>>> x = Tensor(np.random.randn(2, 16, 8, 8)) # batch=2, channels=16, 8x8
>>> y = bn(x)
>>> print(y.shape) # (2, 16, 8, 8) - same shape
HINTS:
- Compute mean/var over axes (0, 2, 3) to get per-channel statistics
- Reshape gamma/beta to (1, C, 1, 1) for broadcasting
- Running stat update: running = (1 - momentum) * running + momentum * batch
"""
### BEGIN SOLUTION
# Input validation
if len(x.shape) != 4:
raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")
batch_size, channels, height, width = x.shape
if channels != self.num_features:
raise ValueError(f"Expected {self.num_features} channels, got {channels}")
if self.training:
# Compute batch statistics per channel
# Mean over batch and spatial dimensions: axes (0, 2, 3)
batch_mean = np.mean(x.data, axis=(0, 2, 3)) # Shape: (C,)
batch_var = np.var(x.data, axis=(0, 2, 3)) # Shape: (C,)
# Update running statistics (exponential moving average)
self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
# Use batch statistics for normalization
mean = batch_mean
var = batch_var
else:
# Use running statistics (frozen during eval)
mean = self.running_mean
var = self.running_var
# Normalize: (x - mean) / sqrt(var + eps)
# Reshape mean and var for broadcasting: (C,) -> (1, C, 1, 1)
mean_reshaped = mean.reshape(1, channels, 1, 1)
var_reshaped = var.reshape(1, channels, 1, 1)
x_normalized = (x.data - mean_reshaped) / np.sqrt(var_reshaped + self.eps)
# Apply scale (gamma) and shift (beta)
# Reshape for broadcasting: (C,) -> (1, C, 1, 1)
gamma_reshaped = self.gamma.data.reshape(1, channels, 1, 1)
beta_reshaped = self.beta.data.reshape(1, channels, 1, 1)
output = gamma_reshaped * x_normalized + beta_reshaped
# Return Tensor with gradient tracking
result = Tensor(output, requires_grad=x.requires_grad or self.gamma.requires_grad)
return result
### END SOLUTION
def parameters(self):
"""Return learnable parameters (gamma and beta)."""
return [self.gamma, self.beta]
def __call__(self, x):
"""Enable model(x) syntax."""
return self.forward(x)
# %% ../../modules/09_spatial/09_spatial.ipynb 25
class SimpleCNN:
"""
Simple CNN demonstrating spatial operations integration.
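A train/eval sketch for the BatchNorm2d above (assumes `tinytorch.core.spatial` exposes it, as the updated `__all__` indicates):

```python
import numpy as np
from tinytorch.core.spatial import BatchNorm2d
from tinytorch.core.tensor import Tensor

bn = BatchNorm2d(16)
x = Tensor(np.random.randn(8, 16, 4, 4))  # (batch, channels, H, W)
y_train = bn(x)         # uses batch statistics, updates running stats
y_eval = bn.eval()(x)   # uses frozen running statistics
assert y_train.shape == y_eval.shape == (8, 16, 4, 4)
```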

View File

@@ -146,8 +146,9 @@ class Tensor:
new_shape[unknown_idx] = unknown_dim
new_shape = tuple(new_shape)
if np.prod(new_shape) != self.size:
target_size = int(np.prod(new_shape))
raise ValueError(
f"Cannot reshape tensor of size {self.size} to shape {new_shape}"
f"Total elements must match: {self.size} {target_size}"
)
reshaped_data = np.reshape(self.data, new_shape)
result = Tensor(reshaped_data, requires_grad=self.requires_grad)

View File

@@ -15,7 +15,7 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['DEFAULT_MAX_LR', 'DEFAULT_MIN_LR', 'DEFAULT_TOTAL_EPOCHS', 'CosineSchedule', 'Trainer']
+__all__ = ['DEFAULT_MAX_LR', 'DEFAULT_MIN_LR', 'DEFAULT_TOTAL_EPOCHS', 'CosineSchedule', 'clip_grad_norm', 'Trainer']
# %% ../../modules/07_training/07_training.ipynb 1
import numpy as np
@@ -76,6 +76,67 @@ class CosineSchedule:
return self.min_lr + (self.max_lr - self.min_lr) * cosine_factor
### END SOLUTION
# %% ../../modules/07_training/07_training.ipynb 10
def clip_grad_norm(parameters: List, max_norm: float = 1.0) -> float:
"""
Clip gradients by global norm to prevent exploding gradients.
This is crucial for training stability, especially with RNNs and deep networks.
Instead of clipping each gradient individually, we compute the global norm
across all parameters and scale uniformly if needed.
TODO: Implement gradient clipping by global norm
APPROACH:
1. Compute total norm: sqrt(sum of squared gradients across all parameters)
2. If total_norm > max_norm, compute clip_coef = max_norm / total_norm
3. Scale all gradients by clip_coef: grad *= clip_coef
4. Return the original norm for monitoring
EXAMPLE:
>>> params = [Tensor([1, 2, 3], requires_grad=True)]
>>> params[0].grad = Tensor([10, 20, 30]) # Large gradients
>>> original_norm = clip_grad_norm(params, max_norm=1.0)
>>> print(f"Clipped norm: {np.linalg.norm(params[0].grad.data):.2f}") # Should be ≤ 1.0
HINTS:
- Use np.linalg.norm() to compute norms
- Only clip if total_norm > max_norm
- Modify gradients in-place for efficiency
"""
### BEGIN SOLUTION
if not parameters:
return 0.0
# Collect all gradients and compute global norm
total_norm = 0.0
for param in parameters:
if param.grad is not None:
# Handle both Tensor gradients and numpy array gradients
if isinstance(param.grad, np.ndarray):
grad_data = param.grad
else:
# Trust that Tensor has .data attribute
grad_data = param.grad.data
total_norm += np.sum(grad_data ** 2)
total_norm = np.sqrt(total_norm)
# Clip if necessary
if total_norm > max_norm:
clip_coef = max_norm / total_norm
for param in parameters:
if param.grad is not None:
# Handle both Tensor gradients and numpy array gradients
if isinstance(param.grad, np.ndarray):
param.grad = param.grad * clip_coef
else:
# Trust that Tensor has .data attribute
param.grad.data = param.grad.data * clip_coef
return float(total_norm)
### END SOLUTION
# %% ../../modules/07_training/07_training.ipynb 14
class Trainer:
"""

View File: tinytorch/data/loader.py (generated, 209 lines changed)

@@ -15,7 +15,7 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['Dataset', 'TensorDataset', 'DataLoader']
+__all__ = ['Dataset', 'TensorDataset', 'DataLoader', 'RandomHorizontalFlip', 'RandomCrop', 'Compose']
# %% ../../modules/08_dataloader/08_dataloader.ipynb 0
#| default_exp data.loader
@@ -262,3 +262,210 @@ class DataLoader:
return tuple(batched_tensors)
### END SOLUTION
# %% ../../modules/08_dataloader/08_dataloader.ipynb 12
class RandomHorizontalFlip:
"""
Randomly flip images horizontally with given probability.
A simple but effective augmentation for most image datasets.
Flipping is appropriate when horizontal orientation doesn't change class
(cats, dogs, cars - not digits or text!).
Args:
p: Probability of flipping (default: 0.5)
"""
def __init__(self, p=0.5):
"""
Initialize RandomHorizontalFlip.
TODO: Store flip probability
EXAMPLE:
>>> flip = RandomHorizontalFlip(p=0.5) # 50% chance to flip
"""
### BEGIN SOLUTION
if not 0.0 <= p <= 1.0:
raise ValueError(f"Probability must be between 0 and 1, got {p}")
self.p = p
### END SOLUTION
def __call__(self, x):
"""
Apply random horizontal flip to input.
TODO: Implement random horizontal flip
APPROACH:
1. Generate random number in [0, 1)
2. If random < p, flip horizontally
3. Otherwise, return unchanged
Args:
x: Input array with shape (..., H, W) or (..., H, W, C)
Flips along the last-1 axis (width dimension)
Returns:
Flipped or unchanged array (same shape as input)
EXAMPLE:
>>> flip = RandomHorizontalFlip(0.5)
>>> img = np.array([[1, 2, 3], [4, 5, 6]]) # 2x3 image
>>> # 50% chance output is [[3, 2, 1], [6, 5, 4]]
HINT: Use np.flip(x, axis=-1) to flip along width axis
"""
### BEGIN SOLUTION
if np.random.random() < self.p:
# Flip along the width axis (last axis for HW format, second-to-last for HWC)
# Using axis=-1 works for both (..., H, W) and (..., H, W, C)
if isinstance(x, Tensor):
return Tensor(np.flip(x.data, axis=-1).copy())
else:
return np.flip(x, axis=-1).copy()
return x
### END SOLUTION
#| export
class RandomCrop:
"""
Randomly crop image after padding.
This is the standard augmentation for CIFAR-10:
1. Pad image by `padding` pixels on each side
2. Randomly crop back to original size
This simulates small translations in the image, forcing the model
to recognize objects regardless of their exact position.
Args:
size: Output crop size (int for square, or tuple (H, W))
padding: Pixels to pad on each side before cropping (default: 4)
"""
def __init__(self, size, padding=4):
"""
Initialize RandomCrop.
TODO: Store crop parameters
EXAMPLE:
>>> crop = RandomCrop(32, padding=4) # CIFAR-10 standard
>>> # Pads to 40x40, then crops back to 32x32
"""
### BEGIN SOLUTION
if isinstance(size, int):
self.size = (size, size)
else:
self.size = size
self.padding = padding
### END SOLUTION
def __call__(self, x):
"""
Apply random crop after padding.
TODO: Implement random crop with padding
APPROACH:
1. Add zero-padding to all sides
2. Choose random top-left corner for crop
3. Extract crop of target size
Args:
x: Input image with shape (C, H, W) or (H, W) or (H, W, C)
Assumes spatial dimensions are H, W
Returns:
Cropped image with target size
EXAMPLE:
>>> crop = RandomCrop(32, padding=4)
>>> img = np.random.randn(3, 32, 32) # CIFAR-10 format (C, H, W)
>>> out = crop(img)
>>> print(out.shape) # (3, 32, 32)
HINTS:
- Use np.pad for adding zeros
- Handle both (C, H, W) and (H, W) formats
- Random offsets should be in [0, 2*padding]
"""
### BEGIN SOLUTION
is_tensor = isinstance(x, Tensor)
data = x.data if is_tensor else x
target_h, target_w = self.size
# Determine image format and dimensions
if len(data.shape) == 2:
# (H, W) format
h, w = data.shape
padded = np.pad(data, self.padding, mode='constant', constant_values=0)
# Random crop position
top = np.random.randint(0, 2 * self.padding + h - target_h + 1)
left = np.random.randint(0, 2 * self.padding + w - target_w + 1)
cropped = padded[top:top + target_h, left:left + target_w]
elif len(data.shape) == 3:
if data.shape[0] <= 4: # Likely (C, H, W) format
c, h, w = data.shape
# Pad only spatial dimensions
padded = np.pad(data,
((0, 0), (self.padding, self.padding), (self.padding, self.padding)),
mode='constant', constant_values=0)
# Random crop position
top = np.random.randint(0, 2 * self.padding + h - target_h + 1)
left = np.random.randint(0, 2 * self.padding + w - target_w + 1)
cropped = padded[:, top:top + target_h, left:left + target_w]
else: # Likely (H, W, C) format
h, w, c = data.shape
padded = np.pad(data,
((self.padding, self.padding), (self.padding, self.padding), (0, 0)),
mode='constant', constant_values=0)
top = np.random.randint(0, 2 * self.padding + h - target_h + 1)
left = np.random.randint(0, 2 * self.padding + w - target_w + 1)
cropped = padded[top:top + target_h, left:left + target_w, :]
else:
raise ValueError(f"Expected 2D or 3D input, got shape {data.shape}")
return Tensor(cropped) if is_tensor else cropped
### END SOLUTION
#| export
class Compose:
"""
Compose multiple transforms into a pipeline.
Applies transforms in sequence, passing output of each
as input to the next.
Args:
transforms: List of transform callables
"""
def __init__(self, transforms):
"""
Initialize Compose with list of transforms.
EXAMPLE:
>>> transforms = Compose([
... RandomHorizontalFlip(0.5),
... RandomCrop(32, padding=4)
... ])
"""
self.transforms = transforms
def __call__(self, x):
"""Apply all transforms in sequence."""
for transform in self.transforms:
x = transform(x)
return x
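# --- Usage sketch (illustrative): the standard CIFAR-10 training pipeline,
# --- built from the transforms defined above.
train_transform = Compose([
    RandomHorizontalFlip(p=0.5),
    RandomCrop(32, padding=4),
])
img = np.random.randn(3, 32, 32).astype(np.float32)  # (C, H, W) image
augmented = train_transform(img)                      # maybe flip, then pad + crop
assert augmented.shape == (3, 32, 32)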

View File

@@ -15,7 +15,7 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'KVCache', 'enable_kv_cache', 'disable_kv_cache']
__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'KVCache', 'create_kv_cache', 'enable_kv_cache', 'disable_kv_cache']
# %% ../../modules/17_memoization/17_memoization.ipynb 1
import numpy as np
@@ -303,11 +303,11 @@ class KVCache:
}
# %% ../../modules/17_memoization/17_memoization.ipynb 11
def enable_kv_cache(batch_size: int, max_seq_len: int, num_layers: int,
def create_kv_cache(batch_size: int, max_seq_len: int, num_layers: int,
num_heads: int, head_dim: int) -> KVCache:
"""
Create and return a KVCache instance for model generation.
This function creates a properly sized cache for the model architecture.
Call this before starting generation, then pass the cache to your
generation loop.
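# --- Hypothetical usage sketch: only create_kv_cache's signature is from this
# --- module; the model call below is illustrative, not a real API.
# cache = create_kv_cache(batch_size=1, max_seq_len=128,
#                         num_layers=4, num_heads=8, head_dim=32)
# for _ in range(max_new_tokens):
#     logits = model(next_token, kv_cache=cache)  # reuses cached K/V per layer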

View File

@@ -15,8 +15,208 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = []
__all__ = ['vectorized_matmul', 'fused_gelu', 'tiled_matmul']
# %% ../../modules/18_acceleration/18_acceleration.ipynb 0
#| default_exp optimization.acceleration
#| export
# %% ../../modules/18_acceleration/18_acceleration.ipynb 7
def vectorized_matmul(a: Tensor, b: Tensor) -> Tensor:
"""
High-performance matrix multiplication using vectorized operations.
This implementation leverages optimized BLAS libraries that use:
- SIMD instructions for parallel computation
- Cache-blocking for memory efficiency
- Multi-threading for CPU parallelization
TODO: Implement production-grade matrix multiplication
APPROACH:
1. Validate shapes are compatible for matrix multiplication
2. Use NumPy's optimized dot product (calls BLAS GEMM)
3. Return result wrapped in Tensor
Args:
a: First tensor for multiplication (M×K or batch×M×K)
b: Second tensor for multiplication (K×N or batch×K×N)
Returns:
Result tensor of shape (M×N or batch×M×N)
EXAMPLE:
Matrix multiplication visualization:
>>> a = Tensor([[1, 2], [3, 4]]) # 2×2
>>> b = Tensor([[5, 6], [7, 8]]) # 2×2
>>> result = vectorized_matmul(a, b)
>>> print(result.data)
[[19 22] # [1×5+2×7, 1×6+2×8] = [19, 22]
[43 50]] # [3×5+4×7, 3×6+4×8] = [43, 50]
PERFORMANCE CHARACTERISTICS:
- Time Complexity: O(N³) but highly optimized
- Space Complexity: O(N²) for result
- Arithmetic Intensity: 2N³ FLOPs over 3N² values moved (12N² bytes in FP32) ≈ N/6 FLOPs per byte, which grows with N (good for large N)
HINTS:
- Check a.shape[-1] == b.shape[-2] for inner dimension match
- Use np.matmul() for batch support and optimization
- Trust BLAS to handle the vectorization magic
"""
### BEGIN SOLUTION
# Input validation for matrix multiplication
if len(a.shape) < 2 or len(b.shape) < 2:
raise ValueError(
f"Matrix multiplication requires 2D+ tensors, got shapes {a.shape} and {b.shape}. "
f"💡 HINT: Use reshape() to add dimensions if needed."
)
if a.shape[-1] != b.shape[-2]:
raise ValueError(
f"Matrix multiplication shape mismatch: {a.shape} @ {b.shape}. "
f"Inner dimensions must match: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}. "
f"💡 HINT: For A@B, A's columns must equal B's rows."
)
# Use NumPy's highly optimized matrix multiplication
# This calls BLAS GEMM (General Matrix Multiply), which uses:
# - SIMD vectorization for parallel arithmetic
# - Cache blocking for memory efficiency
# - Multi-threading on multi-core systems
result_data = np.matmul(a.data, b.data)
return Tensor(result_data)
### END SOLUTION
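# --- Quick sanity/benchmark sketch (illustrative; timings are hardware-dependent):
import time
a = Tensor(np.random.randn(512, 512).astype(np.float32))
b = Tensor(np.random.randn(512, 512).astype(np.float32))
start = time.perf_counter()
result = vectorized_matmul(a, b)
elapsed = time.perf_counter() - start
flops = 2 * 512**3  # multiply-adds in a 512x512x512 GEMM
print(f"~{flops / elapsed / 1e9:.1f} GFLOP/s via BLAS")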
# %% ../../modules/18_acceleration/18_acceleration.ipynb 10
def fused_gelu(x: Tensor) -> Tensor:
"""
Fused GELU activation that combines all operations in a single kernel.
GELU combines the benefits of ReLU and sigmoid:
- Smooth everywhere (unlike ReLU's discontinuity at 0)
- Non-saturating for positive values (unlike sigmoid)
- Probabilistic interpretation: x * P(X ≤ x) where X ~ N(0,1)
Mathematical Definition:
GELU(x) = x * Φ(x) where Φ(x) is the standard normal CDF
Fast Approximation (used here):
GELU(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
TODO: Implement fused GELU to minimize memory bandwidth
APPROACH:
1. Compute all intermediate values in a single expression
2. Avoid creating temporary arrays
3. Let NumPy's broadcasting handle vectorization
Args:
x: Input tensor to apply GELU activation
Returns:
GELU-activated tensor (same shape as input)
EXAMPLE:
>>> x = Tensor([-2, -1, 0, 1, 2])
>>> result = fused_gelu(x)
>>> print(result.data)
[-0.0454 -0.1588  0.      0.8412  1.9546]  # tanh approximation; exact GELU differs only in the 4th decimal
# Notice: smooth transition through 0, positive bias
MEMORY EFFICIENCY:
- Unfused: ~5 named temporary arrays × input_size × 4 bytes kept alive
- Fused expression: no named intermediates; temporaries are freed eagerly
- In a truly fused kernel, bandwidth drops ~80% for memory-bound sizes
  (true fusion needs a JIT like numba or numexpr; pure NumPy still materializes each intermediate internally)
HINTS:
- Use np.sqrt(2.0 / np.pi) for the constant
- Keep the entire expression in one statement to avoid named intermediates
"""
### BEGIN SOLUTION
# Mathematical constant for GELU approximation
sqrt_2_over_pi = np.sqrt(2.0 / np.pi)
# Fused GELU computation - all operations in a single expression
# This avoids keeping named intermediate arrays alive; each NumPy op is
# vectorized, though a JIT compiler would be needed for true kernel fusion
result_data = 0.5 * x.data * (
1.0 + np.tanh(sqrt_2_over_pi * (x.data + 0.044715 * x.data**3))
)
return Tensor(result_data)
### END SOLUTION
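# --- For contrast, an unfused GELU sketch (illustrative): each step below
# --- keeps a named temporary alive that the single-expression version avoids.
def unfused_gelu(x: Tensor) -> Tensor:
    c = np.sqrt(2.0 / np.pi)
    inner = x.data + 0.044715 * x.data**3             # temporary 1
    scaled = c * inner                                 # temporary 2
    activated = np.tanh(scaled)                        # temporary 3
    return Tensor(0.5 * x.data * (1.0 + activated))   # temporaries 4-5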
# %% ../../modules/18_acceleration/18_acceleration.ipynb 16
def tiled_matmul(a: Tensor, b: Tensor, tile_size: int = 64) -> Tensor:
"""
Cache-aware matrix multiplication using tiling/blocking.
Demonstrates blocking algorithm for cache optimization by breaking
large matrix multiplications into cache-sized chunks.
TODO: Implement cache-aware tiled matrix multiplication
APPROACH:
1. Validate inputs for matrix multiplication compatibility
2. Use NumPy's optimized matmul (which already implements tiling internally)
3. In production, explicit tiling would use nested loops over blocks
Args:
a: First matrix (M×K)
b: Second matrix (K×N)
tile_size: Block size for cache efficiency (default: 64)
Returns:
Result matrix (M×N)
EXAMPLE:
>>> a = Tensor(np.random.randn(256, 256))
>>> b = Tensor(np.random.randn(256, 256))
>>> result = tiled_matmul(a, b, tile_size=64)
>>> # Same result as vectorized_matmul, but more cache-friendly for large matrices
PERFORMANCE CHARACTERISTICS:
- Reduces cache misses by working on blocks that fit in L1/L2
- Especially beneficial for matrices larger than cache size
- tile_size is chosen so a tile block fits in fast cache (a 64×64 FP32 tile is 16 KB, well within a typical 32 KB L1)
HINTS:
- For educational purposes, we use NumPy's optimized BLAS
- BLAS libraries (MKL, OpenBLAS) already implement cache blocking
- Explicit tiling would use 6 nested loops (3 for tiles, 3 for elements)
"""
### BEGIN SOLUTION
# Input validation
if len(a.shape) < 2 or len(b.shape) < 2:
raise ValueError(
f"Tiled matmul requires 2D+ tensors, got shapes {a.shape} and {b.shape}. "
f"💡 HINT: Tiling works on matrix operations."
)
if a.shape[-1] != b.shape[-2]:
raise ValueError(
f"Shape mismatch: {a.shape} @ {b.shape}. "
f"Inner dimensions must match for matrix multiplication. "
f"💡 HINT: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}"
)
# For educational purposes, we use NumPy's matmul which already
# implements cache-aware tiling via BLAS libraries (MKL, OpenBLAS)
# These libraries automatically partition large matrices into
# cache-sized blocks for optimal performance
# In a full educational implementation, you would write:
# for i_tile in range(0, M, tile_size):
# for j_tile in range(0, N, tile_size):
# for k_tile in range(0, K, tile_size):
# # Multiply tile blocks that fit in cache
# C[i_tile:i_tile+tile_size, j_tile:j_tile+tile_size] +=
# A[i_tile:i_tile+tile_size, k_tile:k_tile+tile_size] @
# B[k_tile:k_tile+tile_size, j_tile:j_tile+tile_size]
result_data = np.matmul(a.data, b.data)
return Tensor(result_data)
### END SOLUTION
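# --- Reference sketch of explicit tiling (educational; far slower than BLAS,
# --- shown only to make the blocking structure above concrete):
def explicit_tiled_matmul(A, B, tile=64):
    M, K = A.shape
    _, N = B.shape
    C = np.zeros((M, N), dtype=A.dtype)
    for i in range(0, M, tile):          # tile loop over rows of A
        for j in range(0, N, tile):      # tile loop over columns of B
            for k in range(0, K, tile):  # tile loop over the shared dimension
                # Each block product touches sub-matrices that fit in cache;
                # NumPy slicing handles ragged edge tiles automatically.
                C[i:i+tile, j:j+tile] += A[i:i+tile, k:k+tile] @ B[k:k+tile, j:j+tile]
    return C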

View File

@@ -15,8 +15,8 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'magnitude_prune', 'structured_prune', 'KnowledgeDistillation',
'CompressionComplete', 'measure_sparsity', 'compress_model']
__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'magnitude_prune', 'structured_prune', 'low_rank_approximate',
'KnowledgeDistillation', 'CompressionComplete', 'measure_sparsity', 'compress_model']
# %% ../../modules/16_compression/16_compression.ipynb 1
import numpy as np
@@ -145,6 +145,48 @@ def structured_prune(model, prune_ratio=0.5):
return model
### END SOLUTION
# %% ../../modules/16_compression/16_compression.ipynb 18
def low_rank_approximate(weight_matrix, rank_ratio=0.5):
"""
Approximate weight matrix using low-rank decomposition (SVD).
TODO: Implement SVD-based low-rank approximation
APPROACH:
1. Perform SVD: W = U @ S @ V^T
2. Keep only top k singular values where k = rank_ratio * min(dimensions)
3. Reconstruct: W_approx = U[:,:k] @ diag(S[:k]) @ V[:k,:]
4. Return decomposed matrices for memory savings
EXAMPLE:
>>> weight = np.random.randn(100, 50)
>>> U, S, V = low_rank_approximate(weight, rank_ratio=0.3)
>>> # Original: 100*50 = 5000 params
>>> # Compressed: 100*15 + 15*50 = 2250 params, plus 15 singular values (~55% reduction)
HINTS:
- Use np.linalg.svd() for decomposition
- Choose k = int(rank_ratio * min(m, n))
- Return U[:,:k], S[:k], V[:k,:] for reconstruction
"""
### BEGIN SOLUTION
m, n = weight_matrix.shape
# Perform SVD
U, S, V = np.linalg.svd(weight_matrix, full_matrices=False)
# Determine target rank
max_rank = min(m, n)
target_rank = max(1, int(rank_ratio * max_rank))
# Truncate to target rank
U_truncated = U[:, :target_rank]
S_truncated = S[:target_rank]
V_truncated = V[:target_rank, :]
return U_truncated, S_truncated, V_truncated
### END SOLUTION
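# --- Usage sketch (illustrative): reconstruct the approximation and
# --- measure the error and parameter savings.
W = np.random.randn(100, 50).astype(np.float32)
U, S, V = low_rank_approximate(W, rank_ratio=0.3)   # k = 15
W_approx = U @ np.diag(S) @ V                        # back to (100, 50)
rel_error = np.linalg.norm(W - W_approx) / np.linalg.norm(W)
params_stored = U.size + S.size + V.size             # 1500 + 15 + 750 = 2265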
# %% ../../modules/16_compression/16_compression.ipynb 21
class KnowledgeDistillation:
"""

View File

@@ -16,8 +16,8 @@
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['INT8_MIN_VALUE', 'INT8_MAX_VALUE', 'INT8_RANGE', 'EPSILON', 'BYTES_PER_FLOAT32', 'BYTES_PER_INT8', 'MB_TO_BYTES',
'SimpleModel', 'QuantizedLinear', 'QuantizationComplete', 'quantize_int8', 'dequantize_int8',
'quantize_model']
'SimpleModel', 'QuantizedLinear', 'compare_model_sizes', 'QuantizationComplete', 'quantize_int8',
'dequantize_int8', 'quantize_model']
# %% ../../modules/15_quantization/15_quantization.ipynb 3
import numpy as np
@@ -198,6 +198,84 @@ class QuantizedLinear:
}
### END SOLUTION
# %% ../../modules/15_quantization/15_quantization.ipynb 24
def compare_model_sizes(original_model, quantized_model) -> Dict[str, float]:
"""
Compare memory usage between original and quantized models.
TODO: Calculate comprehensive memory comparison
APPROACH:
1. Count parameters in both models
2. Calculate bytes used (FP32 vs INT8)
3. Include quantization overhead
4. Return comparison metrics
Args:
original_model: Model before quantization
quantized_model: Model after quantization
Returns:
Dictionary with 'original_bytes', 'quantized_bytes', 'compression_ratio', 'memory_saved_mb', 'memory_saved_percent', plus parameter counts
EXAMPLE:
>>> layer1 = Linear(100, 50)
>>> layer2 = Linear(50, 10)
>>> model = SimpleModel(layer1, layer2)
>>> quantize_model(model)
>>> stats = compare_model_sizes(model, model) # Same model after in-place quantization
>>> print(f"Reduced to {stats['reduction_ratio']:.1f}x smaller")
Reduced to 4.0x smaller
HINTS:
- FP32 uses 4 bytes per parameter, INT8 uses 1 byte
- Include scale/zero_point overhead (2 values per quantized layer)
- Expected ratio: ~4x for INT8 quantization
"""
### BEGIN SOLUTION
# Count original model parameters
# SimpleModel has .layers attribute, layers may have .parameters() method
original_params = 0
original_bytes = 0
for layer in original_model.layers:
if hasattr(layer, 'parameters'):
params = layer.parameters()
for param in params:
original_params += param.data.size
original_bytes += param.data.size * BYTES_PER_FLOAT32
# Count quantized model parameters
quantized_params = 0
quantized_bytes = 0
for layer in quantized_model.layers:
if isinstance(layer, QuantizedLinear):
memory_info = layer.memory_usage()
quantized_bytes += memory_info['quantized_bytes']
params = layer.parameters()
for param in params:
quantized_params += param.data.size
else:
# Non-quantized layers - may have .parameters() method
if hasattr(layer, 'parameters'):
params = layer.parameters()
for param in params:
quantized_params += param.data.size
quantized_bytes += param.data.size * BYTES_PER_FLOAT32
compression_ratio = original_bytes / quantized_bytes if quantized_bytes > 0 else 1.0
memory_saved = original_bytes - quantized_bytes
return {
'original_params': original_params,
'quantized_params': quantized_params,
'original_bytes': original_bytes,
'quantized_bytes': quantized_bytes,
'compression_ratio': compression_ratio,
'memory_saved_mb': memory_saved / MB_TO_BYTES,
'memory_saved_percent': (memory_saved / original_bytes) * 100 if original_bytes > 0 else 0
}
### END SOLUTION
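# --- Back-of-envelope check of the ~4x figure (illustrative):
# --- a Linear(100, 50) layer has 100*50 + 50 = 5050 parameters.
fp32_bytes = 5050 * BYTES_PER_FLOAT32                      # 20200 bytes
int8_bytes = 5050 * BYTES_PER_INT8 + 2 * BYTES_PER_FLOAT32  # weights + scale/zero-point
print(fp32_bytes / int8_bytes)                             # ≈ 4.0x compression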
# %% ../../modules/15_quantization/15_quantization.ipynb 36
class QuantizationComplete:
"""

View File

@@ -15,7 +15,8 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'Embedding', 'PositionalEncoding', 'EmbeddingLayer']
__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'Embedding', 'PositionalEncoding', 'create_sinusoidal_embeddings',
'EmbeddingLayer']
# %% ../../modules/11_embeddings/11_embeddings.ipynb 2
import numpy as np
@@ -226,6 +227,67 @@ class PositionalEncoding:
return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
### END SOLUTION
# %% ../../modules/11_embeddings/11_embeddings.ipynb 14
def create_sinusoidal_embeddings(max_seq_len: int, embed_dim: int) -> Tensor:
"""
Create sinusoidal positional encodings as used in "Attention Is All You Need".
These fixed encodings use sine and cosine functions to create unique
positional patterns that don't require training and can extrapolate
to longer sequences than seen during training.
TODO: Implement sinusoidal positional encoding generation
APPROACH:
1. Create position indices: [0, 1, 2, ..., max_seq_len-1]
2. Create dimension indices for frequency calculation
3. Apply sine to even dimensions, cosine to odd dimensions
4. Use the transformer paper formula with 10000 base
MATHEMATICAL FORMULA:
PE(pos, 2i) = sin(pos / 10000^(2i/embed_dim))
PE(pos, 2i+1) = cos(pos / 10000^(2i/embed_dim))
EXAMPLE:
>>> pe = create_sinusoidal_embeddings(512, 64)
>>> print(pe.shape)
(512, 64)
>>> # Position 0: [0, 1, 0, 1, 0, 1, ...] (sin(0)=0, cos(0)=1)
>>> # Each position gets unique trigonometric signature
HINTS:
- Use np.arange to create position and dimension arrays
- Calculate div_term using exponential for frequency scaling
- Apply different formulas to even/odd dimensions
- The 10000 base creates different frequencies for different dimensions
"""
### BEGIN SOLUTION
# Create position indices [0, 1, 2, ..., max_seq_len-1]
position = np.arange(max_seq_len, dtype=np.float32)[:, np.newaxis] # (max_seq_len, 1)
# Create dimension indices for calculating frequencies
div_term = np.exp(
np.arange(0, embed_dim, 2, dtype=np.float32) *
-(np.log(10000.0) / embed_dim)
) # (embed_dim//2,)
# Initialize the positional encoding matrix
pe = np.zeros((max_seq_len, embed_dim), dtype=np.float32)
# Apply sine to even indices (0, 2, 4, ...)
pe[:, 0::2] = np.sin(position * div_term)
# Apply cosine to odd indices (1, 3, 5, ...)
if embed_dim % 2 == 1:
# Handle odd embed_dim by only filling available positions
pe[:, 1::2] = np.cos(position * div_term[:-1])
else:
pe[:, 1::2] = np.cos(position * div_term)
return Tensor(pe)
### END SOLUTION
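# --- Quick check of the documented properties (illustrative):
pe = create_sinusoidal_embeddings(512, 64)
assert pe.shape == (512, 64)
row0 = pe.data[0]
assert np.allclose(row0[0::2], 0.0)  # sin(0) = 0 on even dimensions
assert np.allclose(row0[1::2], 1.0)  # cos(0) = 1 on odd dimensions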
# %% ../../modules/11_embeddings/11_embeddings.ipynb 18
class EmbeddingLayer:
"""

View File

@@ -14,6 +14,7 @@ from .src import SrcCommand
from .nbgrader import NBGraderCommand
from .benchmark import BenchmarkCommand
from .community import CommunityCommand
from .verify import VerifyCommand
# Command groups (with subcommands organized in subfolders)
from .system import SystemCommand
@@ -29,6 +30,7 @@ __all__ = [
'NBGraderCommand',
'BenchmarkCommand',
'CommunityCommand',
'VerifyCommand',
# Command groups
'SystemCommand',
'ModuleWorkflowCommand',

tito/commands/verify.py (new file, 232 lines)
View File

@@ -0,0 +1,232 @@
"""
TinyTorch Verify Command
Checks that the environment is set up correctly and ready to use.
On success, prompts to join the community map.
This is essentially `tito system health` plus a package import check and a community sign-up prompt.
"""
import sys
import os
import webbrowser
from argparse import ArgumentParser, Namespace
from pathlib import Path
from rich.panel import Panel
from rich.table import Table
from rich import box
from .base import BaseCommand
class VerifyCommand(BaseCommand):
"""Verify TinyTorch setup is ready, then join the community."""
@property
def name(self) -> str:
return "verify"
@property
def description(self) -> str:
return "Verify setup is ready, then join the community map"
def add_arguments(self, parser: ArgumentParser) -> None:
parser.add_argument(
"--skip-registration",
action="store_true",
help="Skip registration prompt after verification"
)
def run(self, args: Namespace) -> int:
"""Run verification checks and prompt for registration."""
self.console.print()
self.console.print(Panel.fit(
"[bold cyan]🔬 Verifying TinyTorch Setup[/bold cyan]",
border_style="cyan"
))
self.console.print()
all_passed = True
# 1. Environment checks
all_passed &= self._check_environment()
# 2. Project structure checks
all_passed &= self._check_structure()
# 3. Package import checks
all_passed &= self._check_package()
# Result
self.console.print()
if all_passed:
self._show_success()
if not args.skip_registration:
self._prompt_registration()
return 0
else:
self._show_failure()
return 1
def _check_environment(self) -> bool:
"""Check Python environment and dependencies."""
self.console.print("[bold]Environment[/bold]")
all_ok = True
# Python
self.console.print(f" [green]✓[/green] Python {sys.version.split()[0]}")
# Virtual environment
venv_exists = self.venv_path.exists()
in_venv = (
os.environ.get('VIRTUAL_ENV') is not None or
(hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix) or
hasattr(sys, 'real_prefix')
)
if venv_exists and in_venv:
self.console.print(" [green]✓[/green] Virtual environment active")
elif venv_exists:
self.console.print(" [yellow]![/yellow] Virtual environment exists but not active")
self.console.print(" [dim]Run: source activate.sh[/dim]")
else:
self.console.print(" [yellow]![/yellow] No virtual environment")
# Required dependencies
required = [
('numpy', 'NumPy'),
('rich', 'Rich'),
('yaml', 'PyYAML'),
]
for module, name in required:
try:
__import__(module)
self.console.print(f" [green]✓[/green] {name}")
except ImportError:
self.console.print(f" [red]✗[/red] {name} [dim](pip install {module})[/dim]")
all_ok = False
self.console.print()
return all_ok
def _check_structure(self) -> bool:
"""Check project structure exists."""
self.console.print("[bold]Project Structure[/bold]")
all_ok = True
paths = [
('tinytorch/', 'Package'),
('tinytorch/core/', 'Core modules'),
('src/', 'Source modules'),
]
for path, desc in paths:
if Path(path).exists():
self.console.print(f" [green]✓[/green] {path}")
else:
self.console.print(f" [red]✗[/red] {path} [dim]({desc})[/dim]")
all_ok = False
self.console.print()
return all_ok
def _check_package(self) -> bool:
"""Check that tinytorch package is importable."""
self.console.print("[bold]Package[/bold]")
all_ok = True
# Import tinytorch
try:
import tinytorch
self.console.print(" [green]✓[/green] import tinytorch")
except ImportError as e:
self.console.print(f" [red]✗[/red] import tinytorch")
self.console.print(f" [dim red]{e}[/dim red]")
return False
# Check core components
try:
from tinytorch import Tensor
self.console.print(" [green]✓[/green] Tensor available")
except ImportError:
self.console.print(" [red]✗[/red] Tensor not available")
all_ok = False
try:
from tinytorch import Linear, ReLU
self.console.print(" [green]✓[/green] Layers available")
except ImportError:
self.console.print(" [red]✗[/red] Layers not available")
all_ok = False
try:
from tinytorch import SGD
self.console.print(" [green]✓[/green] Optimizer available")
except ImportError:
self.console.print(" [red]✗[/red] Optimizer not available")
all_ok = False
return all_ok
def _show_success(self) -> None:
"""Show success message."""
self.console.print(Panel.fit(
"[bold green]✅ TinyTorch is ready![/bold green]\n\n"
"Your environment is set up correctly.\n"
"You can start working on modules.",
border_style="green",
box=box.ROUNDED
))
def _show_failure(self) -> None:
"""Show failure message."""
self.console.print(Panel.fit(
"[bold red]❌ Setup incomplete[/bold red]\n\n"
"Some checks failed. See above for details.\n\n"
"[dim]Run 'tito setup' to fix common issues[/dim]",
border_style="red",
box=box.ROUNDED
))
def _prompt_registration(self) -> None:
"""Prompt user to join the community."""
from rich.prompt import Confirm
self.console.print()
self.console.print(Panel.fit(
"[bold cyan]🌍 Join the TinyTorch Community[/bold cyan]\n\n"
"Add yourself to the map at [link=https://tinytorch.ai/map]tinytorch.ai/map[/link]\n\n"
"[dim]• See learners worldwide\n"
"• Country & institution (optional)\n"
"• No account required[/dim]",
border_style="cyan"
))
join = Confirm.ask("\n[bold]Join the community?[/bold]", default=True)
if join:
self._open_registration()
else:
self.console.print("[dim]No problem! Run 'tito verify' anytime to join later.[/dim]")
def _open_registration(self) -> None:
"""Open registration page."""
url = "https://tinytorch.ai/join"
self.console.print(f"\n[cyan]Opening registration...[/cyan]")
try:
webbrowser.open(url)
self.console.print(f"[green]✓[/green] Browser opened")
self.console.print(f"[dim] {url}[/dim]")
except Exception:
self.console.print(f"[yellow]Could not open browser.[/yellow]")
self.console.print(f"Please visit: [cyan]{url}[/cyan]")
self.console.print("\n[green]Welcome to the community! 🎉[/green]")

View File

@@ -38,6 +38,7 @@ from .commands.milestone import MilestoneCommand
from .commands.setup import SetupCommand
from .commands.benchmark import BenchmarkCommand
from .commands.community import CommunityCommand
from .commands.verify import VerifyCommand
# Configure logging
logging.basicConfig(
@@ -79,6 +80,8 @@ class TinyTorchCLI:
'test': TestCommand,
'grade': GradeCommand,
'logo': LogoCommand,
# Verification
'verify': VerifyCommand,
}
# Command categorization for help display
@@ -91,6 +94,7 @@ class TinyTorchCLI:
('[green]tito setup[/green]', 'First-time setup'),
('[green]tito module start 01[/green]', 'Start Module 01 (tensors)'),
('[green]tito module complete 01[/green]', 'Test, export, and track progress'),
('[green]tito verify[/green]', 'Verify installation and join community'),
],
'track_progress': [
('[yellow]tito module status[/yellow]', 'View module progress'),