Mirror of https://github.com/MLSysBook/TinyTorch.git, synced 2025-12-05 19:17:52 -06:00
Release preparation: fix package exports, tests, and documentation
Package exports:
- Fix tinytorch/__init__.py to export all required components for milestones
- Add Dense as alias for Linear for compatibility
- Add loss functions (MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss)
- Export spatial operations, data loaders, and transformer components

Test infrastructure:
- Create tests/conftest.py to handle path setup
- Create tests/test_utils.py with shared test utilities
- Rename test_progressive_integration.py files to include module number
- Fix syntax errors in test files (spaces in class names)
- Remove stale test file referencing non-existent modules

Documentation:
- Update README.md with correct milestone file names
- Fix milestone requirements to match actual module dependencies

Export system:
- Run tito export --all to regenerate package from source modules
- Ensure all 20 modules are properly exported
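Taken together, the restored exports should make a quick smoke check like the following pass (illustrative; assumes an editable install of this commit, e.g. `pip install -e .`):

```python
# Smoke test for the restored top-level exports (names taken from the
# updated tinytorch/__init__.py in this commit).
from tinytorch import (
    Tensor, Layer, Linear, Dense, Dropout,
    MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss,
    SGD, Adam, AdamW, DataLoader, Conv2d, MaxPool2d, GPT,
)

assert Dense is Linear  # Dense is an alias for Linear, not a separate class
print("all milestone-facing exports import cleanly")
```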
README.md (16 lines changed)
@@ -317,26 +317,28 @@ tito module complete 01
As you complete modules, unlock historical ML milestones demonstrating YOUR implementations:

-### 🧠 01. Perceptron (1957) - After Module 03
+### 🧠 01. Perceptron (1957) - After Module 07
```bash
cd milestones/01_1957_perceptron
-python perceptron_trained.py
+python 01_rosenblatt_forward.py  # Forward pass demo (after Module 03)
+python 02_rosenblatt_trained.py  # Training demo (after Module 07)
# Rosenblatt's first trainable neural network
# YOUR Linear layer + Sigmoid recreates history!
```
-**Requirements**: Modules 01-03 (Tensor, Activations, Layers)
+**Requirements**: Modules 01-07 (Tensor through Training)
**Achievement**: Binary classification with gradient descent

---

-### ⚡ 02. XOR Crisis (1969) - After Module 05
+### ⚡ 02. XOR Crisis (1969) - After Module 07
```bash
-cd milestones/02_1969_xor_crisis
-python xor_solved.py
+cd milestones/02_1969_xor
+python 01_xor_crisis.py  # Demonstrate the problem
+python 02_xor_solved.py  # Solve with hidden layers!
# Solve Minsky's XOR challenge with hidden layers
# YOUR autograd enables multi-layer learning!
```
-**Requirements**: Modules 01-05 (+ Autograd)
+**Requirements**: Modules 01-07 (Tensor through Training)
**Achievement**: Non-linear problem solving

---

@@ -14,7 +14,7 @@ from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))


-class TestModule01Still Working:
+class TestModule01StillWorking:
    """Verify Module 01 (Setup) functionality is still intact."""

    def test_setup_environment_stable(self):
@@ -88,7 +88,7 @@ class TestModule03ActivationsCore:
        assert True, "Module 02: Sigmoid not implemented yet"


-class TestProgressive StackIntegration:
+class TestProgressiveStackIntegration:
    """Test that the full stack (01→02→03) works together."""

    def test_tensor_activation_pipeline(self):
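The renames above matter because an embedded space makes the class statement unparseable, so pytest failed at collection time before this fix; a minimal standalone reproduction:

```python
# A space inside a class name is a hard SyntaxError at import/collection time
# (standalone repro, not project code):
bad_source = "class TestModule01Still Working:\n    pass\n"
try:
    compile(bad_source, "<repro>", "exec")
except SyntaxError as err:
    print(f"SyntaxError: {err.msg}")  # e.g. "invalid syntax"
```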
@@ -1,334 +0,0 @@
"""
Integration Tests - Layers and Dense Networks

Tests cross-module interfaces and compatibility between individual Layers and Dense Network modules.
Focuses on integration, not re-testing individual module functionality.
"""

import pytest
import numpy as np
from test_utils import setup_integration_test

# Ensure proper setup before importing
setup_integration_test()

# Import ONLY from TinyTorch package
from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Dense
from tinytorch.core.dense import Sequential, create_mlp, MLP
from tinytorch.core.activations import ReLU, Sigmoid, Tanh


class TestLayersDenseNetworkInterface:
    """Test interface compatibility between individual Layers and Dense Networks."""

    def test_dense_layer_to_sequential_network(self):
        """Test that Dense layers can be integrated into Sequential networks."""
        # Create individual dense layers
        layer1 = Dense(input_size=4, output_size=8)
        layer2 = Dense(input_size=8, output_size=3)

        # Test integration into Sequential
        network = Sequential([layer1, ReLU(), layer2])

        # Test interface compatibility
        x = Tensor(np.random.randn(2, 4))
        result = network(x)

        # Verify integration works
        assert isinstance(result, Tensor), "Sequential should work with Dense layers"
        assert result.shape == (2, 3), "Sequential should process through all layers"

    def test_dense_layer_compatibility_with_mlp(self):
        """Test that Dense layers are compatible with MLP construction."""
        # Test that MLP uses same interface as individual Dense layers
        individual_layer = Dense(input_size=6, output_size=10)
        mlp_network = create_mlp(input_size=6, hidden_sizes=[10], output_size=3)

        # Test same input works with both
        x = Tensor(np.random.randn(1, 6))

        # Individual layer output
        layer_output = individual_layer(x)

        # MLP output (should accept same input)
        mlp_output = mlp_network(x)

        # Verify interface compatibility
        assert isinstance(layer_output, Tensor), "Dense layer should return Tensor"
        assert isinstance(mlp_output, Tensor), "MLP should return Tensor"
        assert layer_output.shape == (1, 10), "Dense layer should have expected output shape"
        assert mlp_output.shape == (1, 3), "MLP should have expected output shape"

    def test_layer_output_as_network_input(self):
        """Test that Dense layer output can be used as network input."""
        # Create preprocessing layer
        preprocessor = Dense(input_size=5, output_size=8)

        # Create network that processes preprocessor output
        network = Sequential([
            Dense(input_size=8, output_size=12),
            ReLU(),
            Dense(input_size=12, output_size=4)
        ])

        # Test pipeline: input → layer → network
        x = Tensor(np.random.randn(3, 5))
        preprocessed = preprocessor(x)
        final_output = network(preprocessed)

        # Verify pipeline interface
        assert isinstance(preprocessed, Tensor), "Layer should produce Tensor for network"
        assert isinstance(final_output, Tensor), "Network should accept layer output"
        assert final_output.shape == (3, 4), "Pipeline should work end-to-end"

    def test_network_layer_composition(self):
        """Test that networks can be composed with individual layers."""
        # Create base network
        base_network = create_mlp(input_size=4, hidden_sizes=[6], output_size=8)

        # Add additional processing layer
        final_layer = Dense(input_size=8, output_size=2)

        # Test composition
        x = Tensor(np.random.randn(2, 4))

        # Pipeline: input → network → layer
        network_output = base_network(x)
        final_output = final_layer(network_output)

        # Verify composition interface
        assert isinstance(network_output, Tensor), "Network should produce Tensor for layer"
        assert isinstance(final_output, Tensor), "Layer should accept network output"
        assert network_output.shape == (2, 8), "Network output should have expected shape"
        assert final_output.shape == (2, 2), "Layer should process network output correctly"


class TestLayerNetworkDataFlow:
    """Test data flow compatibility between layers and networks."""

    def test_shape_preservation_across_layer_network_boundary(self):
        """Test shape preservation when crossing layer-network boundaries."""
        shape_configs = [
            (1, 4, 8, 2),    # Single sample
            (5, 6, 10, 3),   # Small batch
            (10, 8, 16, 4),  # Larger batch
        ]

        for batch_size, input_size, hidden_size, output_size in shape_configs:
            # Create layer and network
            layer = Dense(input_size=input_size, output_size=hidden_size)
            network = Sequential([
                Dense(input_size=hidden_size, output_size=hidden_size),
                ReLU(),
                Dense(input_size=hidden_size, output_size=output_size)
            ])

            # Test data flow
            x = Tensor(np.random.randn(batch_size, input_size))
            layer_out = layer(x)
            network_out = network(layer_out)

            # Verify shape flow
            assert layer_out.shape == (batch_size, hidden_size), f"Layer should output correct shape for config {shape_configs}"
            assert network_out.shape == (batch_size, output_size), f"Network should output correct shape for config {shape_configs}"

    def test_dtype_preservation_across_layer_network_boundary(self):
        """Test data type preservation across layer-network boundaries."""
        # Test float32 flow
        layer_f32 = Dense(input_size=4, output_size=6)
        network_f32 = create_mlp(input_size=6, hidden_sizes=[8], output_size=2)

        x_f32 = Tensor(np.random.randn(2, 4).astype(np.float32))
        layer_out_f32 = layer_f32(x_f32)
        network_out_f32 = network_f32(layer_out_f32)

        # Verify dtype preservation
        assert layer_out_f32.dtype == np.float32, "Layer should preserve float32"
        assert network_out_f32.dtype == np.float32, "Network should preserve float32 from layer"

        # Test float64 flow
        layer_f64 = Dense(input_size=4, output_size=6)
        network_f64 = create_mlp(input_size=6, hidden_sizes=[8], output_size=2)

        x_f64 = Tensor(np.random.randn(2, 4).astype(np.float64))
        layer_out_f64 = layer_f64(x_f64)
        network_out_f64 = network_f64(layer_out_f64)

        # Verify dtype preservation
        assert layer_out_f64.dtype == np.float64, "Layer should preserve float64"
        assert network_out_f64.dtype == np.float64, "Network should preserve float64 from layer"

    def test_error_handling_at_layer_network_boundary(self):
        """Test error handling when layer-network interfaces are incompatible."""
        # Create mismatched layer and network
        layer = Dense(input_size=4, output_size=6)
        mismatched_network = Sequential([Dense(input_size=8, output_size=2)])  # Expects 8, gets 6

        x = Tensor(np.random.randn(1, 4))
        layer_output = layer(x)  # Shape (1, 6)

        # Should fail gracefully with dimension mismatch
        try:
            result = mismatched_network(layer_output)  # Expects (1, 8)
            assert False, "Should have failed with dimension mismatch"
        except (ValueError, AssertionError, TypeError) as e:
            # Expected behavior
            assert isinstance(e, (ValueError, AssertionError, TypeError)), "Should fail gracefully with dimension mismatch"


class TestLayerNetworkSystemIntegration:
    """Test system-level integration scenarios with layers and networks."""

    def test_multi_stage_processing_pipeline(self):
        """Test multi-stage processing using layers and networks."""
        # Stage 1: Preprocessing layer
        preprocessor = Dense(input_size=8, output_size=12)

        # Stage 2: Feature extraction network
        feature_extractor = Sequential([
            Dense(input_size=12, output_size=16),
            ReLU(),
            Dense(input_size=16, output_size=10)
        ])

        # Stage 3: Classification layer
        classifier = Dense(input_size=10, output_size=3)

        # Test complete pipeline
        x = Tensor(np.random.randn(4, 8))

        preprocessed = preprocessor(x)
        features = feature_extractor(preprocessed)
        predictions = classifier(features)

        # Verify multi-stage integration
        assert isinstance(preprocessed, Tensor), "Preprocessor should output Tensor"
        assert isinstance(features, Tensor), "Feature extractor should output Tensor"
        assert isinstance(predictions, Tensor), "Classifier should output Tensor"
        assert predictions.shape == (4, 3), "Pipeline should produce expected final shape"

    def test_parallel_layer_processing(self):
        """Test parallel processing with multiple layers feeding into network."""
        # Create parallel processing layers
        branch1 = Dense(input_size=6, output_size=4)
        branch2 = Dense(input_size=6, output_size=4)
        branch3 = Dense(input_size=6, output_size=4)

        # Fusion network
        fusion_network = Sequential([
            Dense(input_size=12, output_size=8),  # 4+4+4=12 from parallel branches
            ReLU(),
            Dense(input_size=8, output_size=2)
        ])

        # Test parallel processing
        x = Tensor(np.random.randn(2, 6))

        # Process in parallel
        out1 = branch1(x)
        out2 = branch2(x)
        out3 = branch3(x)

        # Manually concatenate (simulating fusion)
        # In a real implementation, this would be handled by a concatenation layer
        fused_data = np.concatenate([out1.data, out2.data, out3.data], axis=1)
        fused_tensor = Tensor(fused_data)

        # Final processing
        final_output = fusion_network(fused_tensor)

        # Verify parallel processing integration
        assert out1.shape == (2, 4), "Branch 1 should output correct shape"
        assert out2.shape == (2, 4), "Branch 2 should output correct shape"
        assert out3.shape == (2, 4), "Branch 3 should output correct shape"
        assert fused_tensor.shape == (2, 12), "Fusion should combine all branches"
        assert final_output.shape == (2, 2), "Final network should process fused input"

    def test_layer_network_modularity(self):
        """Test that layers and networks can be replaced modularly."""
        # Create modular components
        input_processors = [
            Dense(input_size=5, output_size=8),
            Dense(input_size=5, output_size=8),  # Different instance
        ]

        core_networks = [
            create_mlp(input_size=8, hidden_sizes=[10], output_size=6),
            Sequential([Dense(input_size=8, output_size=6)]),  # Different architecture
        ]

        output_processors = [
            Dense(input_size=6, output_size=3),
            Dense(input_size=6, output_size=3),  # Different instance
        ]

        # Test all combinations work
        x = Tensor(np.random.randn(1, 5))

        for input_proc in input_processors:
            for core_net in core_networks:
                for output_proc in output_processors:
                    # Test modular pipeline
                    intermediate1 = input_proc(x)
                    intermediate2 = core_net(intermediate1)
                    final = output_proc(intermediate2)

                    # Verify modularity
                    assert isinstance(final, Tensor), "Modular combination should work"
                    assert final.shape == (1, 3), "Modular combination should produce expected output"


class TestLayerNetworkInterfaceStandards:
    """Test that layers and networks follow consistent interface standards."""

    def test_consistent_call_interface(self):
        """Test that layers and networks have consistent callable interface."""
        # Create different components
        components = [
            Dense(input_size=4, output_size=6),
            Sequential([Dense(input_size=4, output_size=6)]),
            create_mlp(input_size=4, hidden_sizes=[8], output_size=6),
            MLP([4, 8, 6])
        ]

        x = Tensor(np.random.randn(1, 4))

        # Test all components have consistent interface
        for component in components:
            # Should be callable with same signature
            result = component(x)

            # Verify consistent interface
            assert isinstance(result, Tensor), f"{type(component).__name__} should return Tensor"
            assert result.shape[0] == 1, f"{type(component).__name__} should preserve batch dimension"
            assert result.shape[1] == 6, f"{type(component).__name__} should produce expected output size"

    def test_component_property_consistency(self):
        """Test that layers and networks have consistent properties."""
        # Create components
        layer = Dense(input_size=3, output_size=5)
        network = Sequential([Dense(input_size=3, output_size=5)])
        mlp = create_mlp(input_size=3, hidden_sizes=[], output_size=5)

        # Test that all components can be used interchangeably
        x = Tensor(np.random.randn(2, 3))

        results = []
        for component in [layer, network, mlp]:
            result = component(x)
            results.append(result)

            # Verify consistent interface properties
            assert hasattr(result, 'shape'), f"{type(component).__name__} result should have shape"
            assert hasattr(result, 'data'), f"{type(component).__name__} result should have data"
            assert hasattr(result, 'dtype'), f"{type(component).__name__} result should have dtype"

        # All should produce same output shape
        expected_shape = (2, 5)
        for i, result in enumerate(results):
            assert result.shape == expected_shape, f"Component {i} should produce consistent shape"


if __name__ == "__main__":
    pytest.main([__file__])
tests/conftest.py (new file, 29 lines)
@@ -0,0 +1,29 @@
"""
Pytest configuration for TinyTorch tests.

This file is automatically loaded by pytest and sets up the test environment.
"""

import sys
import os
from pathlib import Path

# Add tests directory to Python path so test_utils can be imported
tests_dir = Path(__file__).parent
if str(tests_dir) not in sys.path:
    sys.path.insert(0, str(tests_dir))

# Add project root to Python path
project_root = tests_dir.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Set quiet mode for tinytorch imports during tests
os.environ['TINYTORCH_QUIET'] = '1'

# Import test utilities to make them available
try:
    from test_utils import setup_integration_test, create_test_tensor, assert_tensors_close
except ImportError:
    pass  # test_utils not yet created or has issues
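Because pytest loads conftest.py before collecting tests, individual test files no longer need their own sys.path boilerplate; a test file can now start like this (hypothetical minimal example, not part of the commit):

```python
# Hypothetical test file relying on conftest.py's path setup:
from test_utils import setup_integration_test

setup_integration_test()  # seeds NumPy and silences tinytorch's import banner

def test_tensor_roundtrip():
    import numpy as np
    from tinytorch.core.tensor import Tensor
    assert Tensor(np.zeros((2, 2))).shape == (2, 2)
```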
tests/test_utils.py (new file, 114 lines)
@@ -0,0 +1,114 @@
"""
TinyTorch Test Utilities

Shared utilities for integration tests across all modules.
Provides setup functions and common test helpers.
"""

import sys
import os
from pathlib import Path


def setup_integration_test():
    """
    Set up the environment for integration testing.

    This function ensures:
    1. The TinyTorch package is importable
    2. NumPy random seed is set for reproducibility
    3. Warning filters are set appropriately

    Call this at the top of integration test files before importing TinyTorch.
    """
    import warnings
    import numpy as np

    # Ensure tinytorch is on the path (from project root)
    project_root = Path(__file__).parent.parent
    if str(project_root) not in sys.path:
        sys.path.insert(0, str(project_root))

    # Set random seed for reproducibility
    np.random.seed(42)

    # Suppress certain warnings during tests
    warnings.filterwarnings('ignore', category=DeprecationWarning)
    warnings.filterwarnings('ignore', category=FutureWarning)

    # Set quiet mode for tinytorch imports during tests
    os.environ['TINYTORCH_QUIET'] = '1'


def get_project_root() -> Path:
    """Return the project root directory."""
    return Path(__file__).parent.parent


def get_test_data_path() -> Path:
    """Return the path to test data directory."""
    return get_project_root() / "datasets"


def create_test_tensor(shape, requires_grad=True, seed=None):
    """
    Create a test tensor with random data.

    Args:
        shape: Tuple specifying tensor shape
        requires_grad: Whether tensor should track gradients
        seed: Optional random seed for reproducibility

    Returns:
        Tensor with random data
    """
    import numpy as np
    from tinytorch.core.tensor import Tensor

    if seed is not None:
        np.random.seed(seed)

    data = np.random.randn(*shape).astype(np.float32)
    return Tensor(data, requires_grad=requires_grad)


def assert_tensors_close(t1, t2, rtol=1e-5, atol=1e-8, msg=""):
    """
    Assert that two tensors are element-wise close.

    Args:
        t1: First tensor
        t2: Second tensor
        rtol: Relative tolerance
        atol: Absolute tolerance
        msg: Optional message for assertion error
    """
    import numpy as np

    # Extract data from tensors if needed
    data1 = t1.data if hasattr(t1, 'data') else t1
    data2 = t2.data if hasattr(t2, 'data') else t2

    if not np.allclose(data1, data2, rtol=rtol, atol=atol):
        diff = np.abs(data1 - data2)
        max_diff = np.max(diff)
        raise AssertionError(
            f"Tensors not close (max diff: {max_diff:.6e}). {msg}"
        )


def assert_gradients_exist(tensor, msg=""):
    """Assert that a tensor has computed gradients."""
    if tensor.grad is None:
        raise AssertionError(f"Tensor has no gradients. {msg}")


def skip_if_no_tinytorch():
    """Pytest skip decorator for when tinytorch isn't available."""
    import pytest
    try:
        import tinytorch
        return pytest.mark.skipif(False, reason="TinyTorch available")
    except ImportError:
        return pytest.mark.skip(reason="TinyTorch not installed")
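Typical use of the helpers above; since create_test_tensor accepts a seed, same-seed tensors are deterministic and the comparison is exact:

```python
# Example: two tensors created with the same seed must match exactly.
from test_utils import create_test_tensor, assert_tensors_close

a = create_test_tensor((2, 3), seed=0)
b = create_test_tensor((2, 3), seed=0)
assert_tensors_close(a, b, msg="same-seed tensors should be identical")
```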
tinytorch/__init__.py (generated, 99 lines changed)
@@ -1,28 +1,97 @@
"""
TinyTorch - Build ML Systems From First Principles

A complete educational ML framework for learning neural network internals
by implementing everything from scratch.
+
+Top-level exports provide easy access to commonly used components.
+For advanced modules (optimization, profiling), import from submodules:
+    from tinytorch.profiling.profiler import Profiler
+    from tinytorch.optimization.quantization import quantize_int8
+    from tinytorch.generation.kv_cache import enable_kv_cache
"""

__version__ = "0.1.0"

# Import core functionality
from . import core

-# Make common components easily accessible at top level
+# ============================================================================
+# Core Functionality (Modules 01-07)
+# ============================================================================
from .core.tensor import Tensor
-from .core.layers import Linear, Dropout
+from .core.layers import Layer, Linear, Dense, Dropout
from .core.activations import Sigmoid, ReLU, Tanh, GELU, Softmax
-# from .core.losses import MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss  # TEMP: removed for testing
-from .core.optimizers import SGD, AdamW
+from .core.losses import MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss
+from .core.optimizers import SGD, Adam, AdamW
+from .core.training import Trainer, CosineSchedule, clip_grad_norm

-# 🔥 CRITICAL: Enable automatic differentiation
-# This patches Tensor operations to track gradients
-# Use quiet=True when imported by CLI tools to avoid cluttering output
+# ============================================================================
+# Data Loading (Module 08)
+# ============================================================================
+from .data.loader import Dataset, TensorDataset, DataLoader
+
+# ============================================================================
+# Spatial Operations (Module 09)
+# ============================================================================
+from .core.spatial import Conv2d, MaxPool2d
+
+# ============================================================================
+# Text Processing (Modules 10-11)
+# ============================================================================
+from .text.tokenization import Tokenizer, CharTokenizer, BPETokenizer
+from .text.embeddings import Embedding, PositionalEncoding, EmbeddingLayer
+
+# ============================================================================
+# Attention & Transformers (Modules 12-13)
+# ============================================================================
+from .core.attention import MultiHeadAttention, scaled_dot_product_attention
+from .models.transformer import LayerNorm, MLP, TransformerBlock, GPT
+
+# ============================================================================
+# Enable Autograd (CRITICAL - must happen after imports)
+# ============================================================================
import os
from .core.autograd import enable_autograd

+# Enable autograd quietly when imported by CLI tools
enable_autograd(quiet=os.environ.get('TINYTORCH_QUIET', '').lower() in ('1', 'true', 'yes'))

-# Export main public API
+# ============================================================================
+# Public API
+# ============================================================================
__all__ = [
-    'core',
+    # Version
+    '__version__',
+
+    # Core - Tensor
    'Tensor',
-    'Linear', 'Dropout',
+
+    # Core - Activations
    'Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax',
-    # 'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss',  # TEMP: removed for testing
-    'SGD', 'AdamW'
+
+    # Core - Layers
+    'Layer', 'Linear', 'Dense', 'Dropout',
+
+    # Core - Losses
+    'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss',
+
+    # Core - Optimizers
+    'SGD', 'Adam', 'AdamW',
+
+    # Core - Training
+    'Trainer', 'CosineSchedule', 'clip_grad_norm',
+
+    # Data Loading
+    'Dataset', 'TensorDataset', 'DataLoader',
+
+    # Core - Spatial (CNN)
+    'Conv2d', 'MaxPool2d',
+
+    # Text/NLP
+    'Tokenizer', 'CharTokenizer', 'BPETokenizer',
+    'Embedding', 'PositionalEncoding', 'EmbeddingLayer',
+
+    # Core - Attention
+    'MultiHeadAttention', 'scaled_dot_product_attention',
+
+    # Models
+    'LayerNorm', 'MLP', 'TransformerBlock', 'GPT',
]
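One consequence of wiring TINYTORCH_QUIET through enable_autograd: the variable is read at import time, so it must be set before the first import of the package (illustrative):

```python
# TINYTORCH_QUIET is checked when enable_autograd() runs during import,
# so set it first:
import os
os.environ["TINYTORCH_QUIET"] = "1"

import tinytorch
print(tinytorch.__version__)  # "0.1.0", with no autograd chatter
```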
tinytorch/_modidx.py (generated, 98 lines changed)
@@ -63,6 +63,14 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/benchmarking/benchmark.py'),
            'tinytorch.benchmarking.benchmark.Benchmark.run_memory_benchmark': ( '19_benchmarking/benchmarking.html#benchmark.run_memory_benchmark',
                'tinytorch/benchmarking/benchmark.py'),
+           'tinytorch.benchmarking.benchmark.BenchmarkResult': ( '19_benchmarking/benchmarking.html#benchmarkresult',
+               'tinytorch/benchmarking/benchmark.py'),
+           'tinytorch.benchmarking.benchmark.BenchmarkResult.__post_init__': ( '19_benchmarking/benchmarking.html#benchmarkresult.__post_init__',
+               'tinytorch/benchmarking/benchmark.py'),
+           'tinytorch.benchmarking.benchmark.BenchmarkResult.__str__': ( '19_benchmarking/benchmarking.html#benchmarkresult.__str__',
+               'tinytorch/benchmarking/benchmark.py'),
+           'tinytorch.benchmarking.benchmark.BenchmarkResult.to_dict': ( '19_benchmarking/benchmarking.html#benchmarkresult.to_dict',
+               'tinytorch/benchmarking/benchmark.py'),
            'tinytorch.benchmarking.benchmark.BenchmarkSuite': ( '19_benchmarking/benchmarking.html#benchmarksuite',
                'tinytorch/benchmarking/benchmark.py'),
            'tinytorch.benchmarking.benchmark.BenchmarkSuite.__init__': ( '19_benchmarking/benchmarking.html#benchmarksuite.__init__',
@@ -89,10 +97,33 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/benchmarking/benchmark.py'),
            'tinytorch.benchmarking.benchmark.test_unit_benchmark': ( '19_benchmarking/benchmarking.html#test_unit_benchmark',
                'tinytorch/benchmarking/benchmark.py'),
+           'tinytorch.benchmarking.benchmark.test_unit_benchmark_result': ( '19_benchmarking/benchmarking.html#test_unit_benchmark_result',
+               'tinytorch/benchmarking/benchmark.py'),
            'tinytorch.benchmarking.benchmark.test_unit_benchmark_suite': ( '19_benchmarking/benchmarking.html#test_unit_benchmark_suite',
                'tinytorch/benchmarking/benchmark.py'),
            'tinytorch.benchmarking.benchmark.test_unit_tinymlperf': ( '19_benchmarking/benchmarking.html#test_unit_tinymlperf',
                'tinytorch/benchmarking/benchmark.py')},
+   'tinytorch.capstone': { 'tinytorch.capstone.BenchmarkReport': ( '20_capstone/capstone.html#benchmarkreport',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.BenchmarkReport.__init__': ( '20_capstone/capstone.html#benchmarkreport.__init__',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.BenchmarkReport._get_system_info': ( '20_capstone/capstone.html#benchmarkreport._get_system_info',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.BenchmarkReport.benchmark_model': ( '20_capstone/capstone.html#benchmarkreport.benchmark_model',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.SimpleMLP': ('20_capstone/capstone.html#simplemlp', 'tinytorch/capstone.py'),
+           'tinytorch.capstone.SimpleMLP.__init__': ( '20_capstone/capstone.html#simplemlp.__init__',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.SimpleMLP.count_parameters': ( '20_capstone/capstone.html#simplemlp.count_parameters',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.SimpleMLP.forward': ( '20_capstone/capstone.html#simplemlp.forward',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.SimpleMLP.parameters': ( '20_capstone/capstone.html#simplemlp.parameters',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.generate_submission': ( '20_capstone/capstone.html#generate_submission',
+               'tinytorch/capstone.py'),
+           'tinytorch.capstone.save_submission': ( '20_capstone/capstone.html#save_submission',
+               'tinytorch/capstone.py')},
    'tinytorch.competition.submit': { 'tinytorch.competition.submit.generate_baseline': ( 'source/20_competition/competition_dev.html#generate_baseline',
                'tinytorch/competition/submit.py'),
            'tinytorch.competition.submit.generate_submission': ( 'source/20_competition/competition_dev.html#generate_submission',
@@ -115,6 +146,8 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.GELU.forward': ( '02_activations/activations.html#gelu.forward',
                'tinytorch/core/activations.py'),
+           'tinytorch.core.activations.GELU.parameters': ( '02_activations/activations.html#gelu.parameters',
+               'tinytorch/core/activations.py'),
            'tinytorch.core.activations.ReLU': ( '02_activations/activations.html#relu',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.ReLU.__call__': ( '02_activations/activations.html#relu.__call__',
@@ -123,6 +156,8 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.ReLU.forward': ( '02_activations/activations.html#relu.forward',
                'tinytorch/core/activations.py'),
+           'tinytorch.core.activations.ReLU.parameters': ( '02_activations/activations.html#relu.parameters',
+               'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Sigmoid': ( '02_activations/activations.html#sigmoid',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Sigmoid.__call__': ( '02_activations/activations.html#sigmoid.__call__',
@@ -131,6 +166,8 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Sigmoid.forward': ( '02_activations/activations.html#sigmoid.forward',
                'tinytorch/core/activations.py'),
+           'tinytorch.core.activations.Sigmoid.parameters': ( '02_activations/activations.html#sigmoid.parameters',
+               'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Softmax': ( '02_activations/activations.html#softmax',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Softmax.__call__': ( '02_activations/activations.html#softmax.__call__',
@@ -139,6 +176,8 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Softmax.forward': ( '02_activations/activations.html#softmax.forward',
                'tinytorch/core/activations.py'),
+           'tinytorch.core.activations.Softmax.parameters': ( '02_activations/activations.html#softmax.parameters',
+               'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Tanh': ( '02_activations/activations.html#tanh',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Tanh.__call__': ( '02_activations/activations.html#tanh.__call__',
@@ -146,7 +185,9 @@ d = { 'settings': { 'branch': 'main',
            'tinytorch.core.activations.Tanh.backward': ( '02_activations/activations.html#tanh.backward',
                'tinytorch/core/activations.py'),
            'tinytorch.core.activations.Tanh.forward': ( '02_activations/activations.html#tanh.forward',
-               'tinytorch/core/activations.py')},
+               'tinytorch/core/activations.py'),
+           'tinytorch.core.activations.Tanh.parameters': ( '02_activations/activations.html#tanh.parameters',
+               'tinytorch/core/activations.py')},
    'tinytorch.core.attention': { 'tinytorch.core.attention.MultiHeadAttention': ( '12_attention/attention.html#multiheadattention',
                'tinytorch/core/attention.py'),
            'tinytorch.core.attention.MultiHeadAttention.__call__': ( '12_attention/attention.html#multiheadattention.__call__',
@@ -264,6 +305,20 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/core/spatial.py'),
            'tinytorch.core.spatial.AvgPool2d.parameters': ( '09_spatial/spatial.html#avgpool2d.parameters',
                'tinytorch/core/spatial.py'),
+           'tinytorch.core.spatial.BatchNorm2d': ( '09_spatial/spatial.html#batchnorm2d',
+               'tinytorch/core/spatial.py'),
+           'tinytorch.core.spatial.BatchNorm2d.__call__': ( '09_spatial/spatial.html#batchnorm2d.__call__',
+               'tinytorch/core/spatial.py'),
+           'tinytorch.core.spatial.BatchNorm2d.__init__': ( '09_spatial/spatial.html#batchnorm2d.__init__',
+               'tinytorch/core/spatial.py'),
+           'tinytorch.core.spatial.BatchNorm2d.eval': ( '09_spatial/spatial.html#batchnorm2d.eval',
+               'tinytorch/core/spatial.py'),
+           'tinytorch.core.spatial.BatchNorm2d.forward': ( '09_spatial/spatial.html#batchnorm2d.forward',
+               'tinytorch/core/spatial.py'),
+           'tinytorch.core.spatial.BatchNorm2d.parameters': ( '09_spatial/spatial.html#batchnorm2d.parameters',
+               'tinytorch/core/spatial.py'),
+           'tinytorch.core.spatial.BatchNorm2d.train': ( '09_spatial/spatial.html#batchnorm2d.train',
+               'tinytorch/core/spatial.py'),
            'tinytorch.core.spatial.Conv2d': ('09_spatial/spatial.html#conv2d', 'tinytorch/core/spatial.py'),
            'tinytorch.core.spatial.Conv2d.__call__': ( '09_spatial/spatial.html#conv2d.__call__',
                'tinytorch/core/spatial.py'),
@@ -367,8 +422,16 @@ d = { 'settings': { 'branch': 'main',
            'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training.html#trainer.save_checkpoint',
                'tinytorch/core/training.py'),
            'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training.html#trainer.train_epoch',
-               'tinytorch/core/training.py')},
-   'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader.html#dataloader',
+               'tinytorch/core/training.py'),
+           'tinytorch.core.training.clip_grad_norm': ( '07_training/training.html#clip_grad_norm',
+               'tinytorch/core/training.py')},
+   'tinytorch.data.loader': { 'tinytorch.data.loader.Compose': ( '08_dataloader/dataloader.html#compose',
+               'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.Compose.__call__': ( '08_dataloader/dataloader.html#compose.__call__',
+               'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.Compose.__init__': ( '08_dataloader/dataloader.html#compose.__init__',
+               'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader.html#dataloader',
                'tinytorch/data/loader.py'),
            'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader.html#dataloader.__init__',
                'tinytorch/data/loader.py'),
@@ -384,6 +447,18 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/data/loader.py'),
            'tinytorch.data.loader.Dataset.__len__': ( '08_dataloader/dataloader.html#dataset.__len__',
                'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.RandomCrop': ( '08_dataloader/dataloader.html#randomcrop',
+               'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.RandomCrop.__call__': ( '08_dataloader/dataloader.html#randomcrop.__call__',
+               'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.RandomCrop.__init__': ( '08_dataloader/dataloader.html#randomcrop.__init__',
+               'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.RandomHorizontalFlip': ( '08_dataloader/dataloader.html#randomhorizontalflip',
+               'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.RandomHorizontalFlip.__call__': ( '08_dataloader/dataloader.html#randomhorizontalflip.__call__',
+               'tinytorch/data/loader.py'),
+           'tinytorch.data.loader.RandomHorizontalFlip.__init__': ( '08_dataloader/dataloader.html#randomhorizontalflip.__init__',
+               'tinytorch/data/loader.py'),
            'tinytorch.data.loader.TensorDataset': ( '08_dataloader/dataloader.html#tensordataset',
                'tinytorch/data/loader.py'),
            'tinytorch.data.loader.TensorDataset.__getitem__': ( '08_dataloader/dataloader.html#tensordataset.__getitem__',
@@ -406,6 +481,8 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/generation/kv_cache.py'),
            'tinytorch.generation.kv_cache.KVCache.update': ( '17_memoization/memoization.html#kvcache.update',
                'tinytorch/generation/kv_cache.py'),
+           'tinytorch.generation.kv_cache.create_kv_cache': ( '17_memoization/memoization.html#create_kv_cache',
+               'tinytorch/generation/kv_cache.py'),
            'tinytorch.generation.kv_cache.disable_kv_cache': ( '17_memoization/memoization.html#disable_kv_cache',
                'tinytorch/generation/kv_cache.py'),
            'tinytorch.generation.kv_cache.enable_kv_cache': ( '17_memoization/memoization.html#enable_kv_cache',
@@ -454,7 +531,12 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/models/transformer.py'),
            'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers.html#transformerblock.parameters',
                'tinytorch/models/transformer.py')},
-   'tinytorch.optimization.acceleration': {},
+   'tinytorch.optimization.acceleration': { 'tinytorch.optimization.acceleration.fused_gelu': ( '18_acceleration/acceleration.html#fused_gelu',
+               'tinytorch/optimization/acceleration.py'),
+           'tinytorch.optimization.acceleration.tiled_matmul': ( '18_acceleration/acceleration.html#tiled_matmul',
+               'tinytorch/optimization/acceleration.py'),
+           'tinytorch.optimization.acceleration.vectorized_matmul': ( '18_acceleration/acceleration.html#vectorized_matmul',
+               'tinytorch/optimization/acceleration.py')},
    'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.CompressionComplete': ( '16_compression/compression.html#compressioncomplete',
                'tinytorch/optimization/compression.py'),
            'tinytorch.optimization.compression.CompressionComplete.compress_model': ( '16_compression/compression.html#compressioncomplete.compress_model',
@@ -479,6 +561,8 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/optimization/compression.py'),
            'tinytorch.optimization.compression.compress_model': ( '16_compression/compression.html#compress_model',
                'tinytorch/optimization/compression.py'),
+           'tinytorch.optimization.compression.low_rank_approximate': ( '16_compression/compression.html#low_rank_approximate',
+               'tinytorch/optimization/compression.py'),
            'tinytorch.optimization.compression.magnitude_prune': ( '16_compression/compression.html#magnitude_prune',
                'tinytorch/optimization/compression.py'),
            'tinytorch.optimization.compression.measure_sparsity': ( '16_compression/compression.html#measure_sparsity',
@@ -515,6 +599,8 @@ d = { 'settings': { 'branch': 'main',
                'tinytorch/optimization/quantization.py'),
            'tinytorch.optimization.quantization.SimpleModel.forward': ( '15_quantization/quantization.html#simplemodel.forward',
                'tinytorch/optimization/quantization.py'),
+           'tinytorch.optimization.quantization.compare_model_sizes': ( '15_quantization/quantization.html#compare_model_sizes',
+               'tinytorch/optimization/quantization.py'),
            'tinytorch.optimization.quantization.dequantize_int8': ( '15_quantization/quantization.html#dequantize_int8',
                'tinytorch/optimization/quantization.py'),
            'tinytorch.optimization.quantization.quantize_int8': ( '15_quantization/quantization.html#quantize_int8',
@@ -578,7 +664,9 @@ d = { 'settings': { 'branch': 'main',
            'tinytorch.text.embeddings.PositionalEncoding.forward': ( '11_embeddings/embeddings.html#positionalencoding.forward',
                'tinytorch/text/embeddings.py'),
            'tinytorch.text.embeddings.PositionalEncoding.parameters': ( '11_embeddings/embeddings.html#positionalencoding.parameters',
-               'tinytorch/text/embeddings.py')},
+               'tinytorch/text/embeddings.py'),
+           'tinytorch.text.embeddings.create_sinusoidal_embeddings': ( '11_embeddings/embeddings.html#create_sinusoidal_embeddings',
+               'tinytorch/text/embeddings.py')},
    'tinytorch.text.tokenization': { 'tinytorch.text.tokenization.BPETokenizer': ( '10_tokenization/tokenization.html#bpetokenizer',
                'tinytorch/text/tokenization.py'),
            'tinytorch.text.tokenization.BPETokenizer.__init__': ( '10_tokenization/tokenization.html#bpetokenizer.__init__',
tinytorch/benchmarking/benchmark.py (generated, 106 lines changed)
@@ -15,14 +15,116 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['DEFAULT_WARMUP_RUNS', 'DEFAULT_MEASUREMENT_RUNS', 'Benchmark', 'test_unit_benchmark', 'BenchmarkSuite',
-           'test_unit_benchmark_suite', 'TinyMLPerf', 'test_unit_tinymlperf']
+__all__ = ['DEFAULT_WARMUP_RUNS', 'DEFAULT_MEASUREMENT_RUNS', 'BenchmarkResult', 'test_unit_benchmark_result', 'Benchmark',
+           'test_unit_benchmark', 'BenchmarkSuite', 'test_unit_benchmark_suite', 'TinyMLPerf', 'test_unit_tinymlperf']

# %% ../../modules/19_benchmarking/19_benchmarking.ipynb 0
# Constants for benchmarking defaults
DEFAULT_WARMUP_RUNS = 5        # Default warmup runs for JIT compilation and cache warming
DEFAULT_MEASUREMENT_RUNS = 10  # Default measurement runs for statistical significance

# %% ../../modules/19_benchmarking/19_benchmarking.ipynb 9
@dataclass
class BenchmarkResult:
    """
    Container for benchmark measurements with statistical analysis.

    TODO: Implement a robust result container that stores measurements and metadata

    APPROACH:
    1. Store raw measurements and computed statistics
    2. Include metadata about test conditions
    3. Provide methods for statistical analysis
    4. Support serialization for result persistence

    EXAMPLE:
    >>> result = BenchmarkResult("model_accuracy", [0.95, 0.94, 0.96])
    >>> print(f"Mean: {result.mean:.3f} ± {result.std:.3f}")
    Mean: 0.950 ± 0.010

    HINTS:
    - Use statistics module for robust mean/std calculations
    - Store both raw data and summary statistics
    - Include confidence intervals for professional reporting
    """
    ### BEGIN SOLUTION
    metric_name: str
    values: List[float]
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Compute statistics after initialization."""
        if not self.values:
            raise ValueError(
                "BenchmarkResult requires at least one measurement.\n"
                "  Issue: Cannot compute statistics without any measurements.\n"
                "  Fix: Ensure benchmark runs produce at least one measurement before creating BenchmarkResult."
            )

        self.mean = statistics.mean(self.values)
        self.std = statistics.stdev(self.values) if len(self.values) > 1 else 0.0
        self.median = statistics.median(self.values)
        self.min_val = min(self.values)
        self.max_val = max(self.values)
        self.count = len(self.values)

        # 95% confidence interval for the mean
        if len(self.values) > 1:
            t_score = 1.96  # Approximate for large samples
            margin_error = t_score * (self.std / np.sqrt(self.count))
            self.ci_lower = self.mean - margin_error
            self.ci_upper = self.mean + margin_error
        else:
            self.ci_lower = self.ci_upper = self.mean

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            'metric_name': self.metric_name,
            'values': self.values,
            'mean': self.mean,
            'std': self.std,
            'median': self.median,
            'min': self.min_val,
            'max': self.max_val,
            'count': self.count,
            'ci_lower': self.ci_lower,
            'ci_upper': self.ci_upper,
            'metadata': self.metadata
        }

    def __str__(self) -> str:
        return f"{self.metric_name}: {self.mean:.4f} ± {self.std:.4f} (n={self.count})"
    ### END SOLUTION

def test_unit_benchmark_result():
    """🔬 Test BenchmarkResult statistical calculations."""
    print("🔬 Unit Test: BenchmarkResult...")

    # Test basic statistics
    values = [1.0, 2.0, 3.0, 4.0, 5.0]
    result = BenchmarkResult("test_metric", values)

    assert result.mean == 3.0
    assert abs(result.std - statistics.stdev(values)) < 1e-10
    assert result.median == 3.0
    assert result.min_val == 1.0
    assert result.max_val == 5.0
    assert result.count == 5

    # Test confidence intervals
    assert result.ci_lower < result.mean < result.ci_upper

    # Test serialization
    result_dict = result.to_dict()
    assert result_dict['metric_name'] == "test_metric"
    assert result_dict['mean'] == 3.0

    print("✅ BenchmarkResult works correctly!")

if __name__ == "__main__":
    test_unit_benchmark_result()

# %% ../../modules/19_benchmarking/19_benchmarking.ipynb 13
class Benchmark:
    """
tinytorch/core/attention.py (generated, 9 lines changed)
@@ -293,11 +293,10 @@ class MultiHeadAttention:
        mask_reshaped = mask
        if mask is not None and len(mask.shape) == 3:
            # Add head dimension: (batch, seq, seq) -> (batch, 1, seq, seq)
-           # Note: Tensor.reshape doesn't support adding dims easily without full shape
-           # But we can use numpy reshape on data and wrap in Tensor?
-           # Or just rely on broadcasting if mask is 2D?
-           # In the proof script, mask is None, so this is fine.
-           pass
+           # This allows the mask to broadcast across all attention heads
+           batch_size_mask, seq_len_mask, _ = mask.shape
+           mask_data = mask.data.reshape(batch_size_mask, 1, seq_len_mask, seq_len_mask)
+           mask_reshaped = Tensor(mask_data, requires_grad=False)

        attended, _ = scaled_dot_product_attention(Q, K, V, mask=mask_reshaped)
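Why the reshape matters: attention scores carry a head axis, so a (batch, seq, seq) mask needs a singleton dimension to broadcast across heads. A standalone NumPy sketch of the shape logic (the exact masking arithmetic inside scaled_dot_product_attention may differ):

```python
import numpy as np

batch, heads, seq = 2, 4, 5
scores = np.zeros((batch, heads, seq, seq))   # per-head attention scores
mask = np.random.rand(batch, seq, seq) > 0.5  # (batch, seq, seq)

# (batch, seq, seq) -> (batch, 1, seq, seq): broadcasts over the head axis
mask4d = mask.reshape(batch, 1, seq, seq)
masked = np.where(mask4d, scores, -1e9)       # masked positions get a large negative
print(masked.shape)                           # (2, 4, 5, 5)
```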
tinytorch/core/autograd.py (generated, 5 lines changed)
@@ -446,6 +446,7 @@ class EmbeddingBackward(Function):

        return (grad_weight,)

+#| export
class SliceBackward(Function):
    """
@@ -1298,6 +1299,6 @@ def enable_autograd(quiet=False):
        print("   - requires_grad=True enables tracking")

# Auto-enable when module is imported
-# Check TINYTORCH_QUIET env var to suppress messages (for CLI tools)
+# Always quiet to avoid cluttering user imports
import os
-enable_autograd(quiet=os.environ.get('TINYTORCH_QUIET', '').lower() in ('1', 'true', 'yes'))
+enable_autograd(quiet=True)
tinytorch/core/layers.py (generated, 6 lines changed)
@@ -15,7 +15,7 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['XAVIER_SCALE_FACTOR', 'HE_SCALE_FACTOR', 'DROPOUT_MIN_PROB', 'DROPOUT_MAX_PROB', 'Layer', 'Linear', 'Dropout']
+__all__ = ['XAVIER_SCALE_FACTOR', 'HE_SCALE_FACTOR', 'DROPOUT_MIN_PROB', 'DROPOUT_MAX_PROB', 'Layer', 'Linear', 'Dense', 'Dropout']

# %% ../../modules/03_layers/03_layers.ipynb 1
import numpy as np
@@ -273,3 +273,7 @@ class Dropout(Layer):

    def __repr__(self):
        return f"Dropout(p={self.p})"
+
+# Alias for compatibility - Dense is the same as Linear
+# Some frameworks use Dense, some use Linear - they're identical
+Dense = Linear
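A quick check of the alias (constructor kwargs as used by the integration tests removed earlier in this commit):

```python
from tinytorch.core.layers import Dense, Linear

assert Dense is Linear                      # same class object, not a subclass
layer = Dense(input_size=3, output_size=2)  # kwargs per the removed tests
print(type(layer).__name__)                 # "Linear" - aliasing does not rename
```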
tinytorch/core/optimizers.py (generated, 27 lines changed)
@@ -240,9 +240,14 @@ class SGD(Optimizer):
            if param.grad is None:
                continue

-           # Get gradient data (grad is a Tensor from Module 01)
+           # Get gradient data - grad can be Tensor or numpy array
            grad = param.grad
-           grad_data = grad.data
+           # Handle both Tensor (with .data) and numpy array (from autograd) cases
+           if isinstance(grad, Tensor):
+               grad_data = grad.data
+           else:
+               # grad is already a numpy array from autograd
+               grad_data = grad

            # Apply weight decay
            if self.weight_decay != 0:
@@ -342,9 +347,14 @@ class Adam(Optimizer):
            if param.grad is None:
                continue

-           # Get gradient data (grad is a Tensor from Module 01)
+           # Get gradient data - grad can be Tensor or numpy array
            grad = param.grad
-           grad_data = grad.data
+           # Handle both Tensor (with .data) and numpy array (from autograd) cases
+           if isinstance(grad, Tensor):
+               grad_data = grad.data
+           else:
+               # grad is already a numpy array from autograd
+               grad_data = grad

            # Apply weight decay
            if self.weight_decay != 0:
@@ -446,9 +456,14 @@ class AdamW(Optimizer):
            if param.grad is None:
                continue

-           # Get gradient data (NOT modified by weight decay)
+           # Get gradient data - grad can be Tensor or numpy array
            grad = param.grad
-           grad_data = grad.data
+           # Handle both Tensor (with .data) and numpy array (from autograd) cases
+           if isinstance(grad, Tensor):
+               grad_data = grad.data
+           else:
+               # grad is already a numpy array from autograd
+               grad_data = grad

            # Initialize buffers if needed
            if self.m_buffers[i] is None:
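The Tensor-or-ndarray branch is now repeated verbatim in SGD, Adam, and AdamW; a helper like the following could express it once (a refactoring sketch, not part of this commit):

```python
def _grad_as_array(grad):
    """Return the raw ndarray whether grad is a Tensor or already an ndarray.

    Hypothetical helper mirroring the branch above; uses the same duck-typing
    trick as assert_tensors_close in tests/test_utils.py.
    """
    return grad.data if hasattr(grad, "data") else grad
```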
tinytorch/core/spatial.py (generated, 159 lines changed)
@@ -16,7 +16,7 @@
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# %% auto 0
|
||||
__all__ = ['DEFAULT_KERNEL_SIZE', 'DEFAULT_STRIDE', 'DEFAULT_PADDING', 'Conv2dBackward', 'Conv2d', 'MaxPool2dBackward',
|
||||
'MaxPool2d', 'AvgPool2d', 'SimpleCNN']
|
||||
'MaxPool2d', 'AvgPool2d', 'BatchNorm2d', 'SimpleCNN']
|
||||
|
||||
# %% ../../modules/09_spatial/09_spatial.ipynb 1
|
||||
import numpy as np
|
||||
@@ -133,6 +133,7 @@ class Conv2dBackward(Function):
|
||||
# Following TinyTorch protocol: return (grad_input, grad_weight, grad_bias)
|
||||
return grad_input, grad_weight, grad_bias
|
||||
|
||||
#| export
|
||||
|
||||
class Conv2d:
|
||||
"""
|
||||
@@ -392,6 +393,7 @@ class MaxPool2dBackward(Function):
|
||||
# Return as tuple (following Function protocol)
|
||||
return (grad_input,)
|
||||
|
||||
#| export
|
||||
|
||||
class MaxPool2d:
|
||||
"""
|
||||
@@ -662,7 +664,160 @@ class AvgPool2d:
|
||||
"""Enable model(x) syntax."""
|
||||
return self.forward(x)
|
||||
|
||||
# %% ../../modules/09_spatial/09_spatial.ipynb 21
|
||||
# %% ../../modules/09_spatial/09_spatial.ipynb 15
|
||||
class BatchNorm2d:
    """
    Batch Normalization for 2D spatial inputs (images).

    Normalizes activations across batch and spatial dimensions for each channel,
    then applies learnable scale (gamma) and shift (beta) parameters.

    Key behaviors:
    - Training: Uses batch statistics, updates running statistics
    - Eval: Uses frozen running statistics for consistent inference

    Args:
        num_features: Number of channels (C in NCHW format)
        eps: Small constant for numerical stability (default: 1e-5)
        momentum: Momentum for running statistics update (default: 0.1)
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        """
        Initialize BatchNorm2d layer.

        TODO: Initialize learnable and running parameters

        APPROACH:
        1. Store hyperparameters (num_features, eps, momentum)
        2. Initialize gamma (scale) to ones - identity at start
        3. Initialize beta (shift) to zeros - no shift at start
        4. Initialize running_mean to zeros
        5. Initialize running_var to ones
        6. Set training mode to True initially

        EXAMPLE:
        >>> bn = BatchNorm2d(64)   # For 64-channel feature maps
        >>> print(bn.gamma.shape)  # (64,)
        >>> print(bn.training)     # True
        """
        super().__init__()

        ### BEGIN SOLUTION
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Learnable parameters (requires_grad=True for training)
        # gamma (scale): initialized to 1 so output = normalized input initially
        self.gamma = Tensor(np.ones(num_features), requires_grad=True)
        # beta (shift): initialized to 0 so no shift initially
        self.beta = Tensor(np.zeros(num_features), requires_grad=True)

        # Running statistics (not trained, accumulated during training)
        # These are used during evaluation for consistent normalization
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

        # Training mode flag
        self.training = True
        ### END SOLUTION

    def train(self):
        """Set layer to training mode."""
        self.training = True
        return self

    def eval(self):
        """Set layer to evaluation mode."""
        self.training = False
        return self

    def forward(self, x):
        """
        Forward pass through BatchNorm2d.

        TODO: Implement batch normalization forward pass

        APPROACH:
        1. Validate input shape (must be 4D: batch, channels, height, width)
        2. If training:
           a. Compute batch mean and variance per channel
           b. Normalize using batch statistics
           c. Update running statistics with momentum
        3. If eval:
           a. Use running mean and variance
           b. Normalize using frozen statistics
        4. Apply scale (gamma) and shift (beta)

        EXAMPLE:
        >>> bn = BatchNorm2d(16)
        >>> x = Tensor(np.random.randn(2, 16, 8, 8))  # batch=2, channels=16, 8x8
        >>> y = bn(x)
        >>> print(y.shape)  # (2, 16, 8, 8) - same shape

        HINTS:
        - Compute mean/var over axes (0, 2, 3) to get per-channel statistics
        - Reshape gamma/beta to (1, C, 1, 1) for broadcasting
        - Running stat update: running = (1 - momentum) * running + momentum * batch
        """
        ### BEGIN SOLUTION
        # Input validation
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")

        batch_size, channels, height, width = x.shape

        if channels != self.num_features:
            raise ValueError(f"Expected {self.num_features} channels, got {channels}")

        if self.training:
            # Compute batch statistics per channel
            # Mean over batch and spatial dimensions: axes (0, 2, 3)
            batch_mean = np.mean(x.data, axis=(0, 2, 3))  # Shape: (C,)
            batch_var = np.var(x.data, axis=(0, 2, 3))    # Shape: (C,)

            # Update running statistics (exponential moving average)
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var

            # Use batch statistics for normalization
            mean = batch_mean
            var = batch_var
        else:
            # Use running statistics (frozen during eval)
            mean = self.running_mean
            var = self.running_var

        # Normalize: (x - mean) / sqrt(var + eps)
        # Reshape mean and var for broadcasting: (C,) -> (1, C, 1, 1)
        mean_reshaped = mean.reshape(1, channels, 1, 1)
        var_reshaped = var.reshape(1, channels, 1, 1)

        x_normalized = (x.data - mean_reshaped) / np.sqrt(var_reshaped + self.eps)

        # Apply scale (gamma) and shift (beta)
        # Reshape for broadcasting: (C,) -> (1, C, 1, 1)
        gamma_reshaped = self.gamma.data.reshape(1, channels, 1, 1)
        beta_reshaped = self.beta.data.reshape(1, channels, 1, 1)

        output = gamma_reshaped * x_normalized + beta_reshaped

        # Return Tensor with gradient tracking
        result = Tensor(output, requires_grad=x.requires_grad or self.gamma.requires_grad)

        return result
        ### END SOLUTION

    def parameters(self):
        """Return learnable parameters (gamma and beta)."""
        return [self.gamma, self.beta]

    def __call__(self, x):
        """Enable model(x) syntax."""
        return self.forward(x)
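
# --- Illustrative usage sketch (not part of the generated module) ---
# A minimal check of the train/eval contract described above, assuming the
# Tensor class used throughout this file:
# >>> bn = BatchNorm2d(3)
# >>> x = Tensor(np.random.randn(8, 3, 4, 4))
# >>> y = bn(x)              # training mode: batch stats, running stats updated
# >>> bn.eval()
# >>> y_eval = bn(x)         # eval mode: frozen running statistics
# >>> y.shape == y_eval.shape == (8, 3, 4, 4)
# True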

# %% ../../modules/09_spatial/09_spatial.ipynb 25
class SimpleCNN:
    """
    Simple CNN demonstrating spatial operations integration.

tinytorch/core/tensor.py (generated, 3 changes)
@@ -146,8 +146,9 @@ class Tensor:
                new_shape[unknown_idx] = unknown_dim
            new_shape = tuple(new_shape)
            if np.prod(new_shape) != self.size:
                target_size = int(np.prod(new_shape))
                raise ValueError(
                    f"Cannot reshape tensor of size {self.size} to shape {new_shape}. "
                    f"Total elements must match: {self.size} ≠ {target_size}"
                )
            reshaped_data = np.reshape(self.data, new_shape)
            result = Tensor(reshaped_data, requires_grad=self.requires_grad)

tinytorch/core/training.py (generated, 63 changes)

@@ -15,7 +15,7 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['DEFAULT_MAX_LR', 'DEFAULT_MIN_LR', 'DEFAULT_TOTAL_EPOCHS', 'CosineSchedule', 'Trainer']
+__all__ = ['DEFAULT_MAX_LR', 'DEFAULT_MIN_LR', 'DEFAULT_TOTAL_EPOCHS', 'CosineSchedule', 'clip_grad_norm', 'Trainer']

# %% ../../modules/07_training/07_training.ipynb 1
import numpy as np
@@ -76,6 +76,67 @@ class CosineSchedule:
        return self.min_lr + (self.max_lr - self.min_lr) * cosine_factor
        ### END SOLUTION

# %% ../../modules/07_training/07_training.ipynb 10
def clip_grad_norm(parameters: List, max_norm: float = 1.0) -> float:
    """
    Clip gradients by global norm to prevent exploding gradients.

    This is crucial for training stability, especially with RNNs and deep networks.
    Instead of clipping each gradient individually, we compute the global norm
    across all parameters and scale uniformly if needed.

    TODO: Implement gradient clipping by global norm

    APPROACH:
    1. Compute total norm: sqrt(sum of squared gradients across all parameters)
    2. If total_norm > max_norm, compute clip_coef = max_norm / total_norm
    3. Scale all gradients by clip_coef: grad *= clip_coef
    4. Return the original norm for monitoring

    EXAMPLE:
    >>> params = [Tensor([1, 2, 3], requires_grad=True)]
    >>> params[0].grad = Tensor([10, 20, 30])  # Large gradients
    >>> original_norm = clip_grad_norm(params, max_norm=1.0)
    >>> print(f"Clipped norm: {np.linalg.norm(params[0].grad.data):.2f}")  # Should be ≤ 1.0

    HINTS:
    - Use np.linalg.norm() to compute norms
    - Only clip if total_norm > max_norm
    - Modify gradients in-place for efficiency
    """
    ### BEGIN SOLUTION
    if not parameters:
        return 0.0

    # Collect all gradients and compute global norm
    total_norm = 0.0
    for param in parameters:
        if param.grad is not None:
            # Handle both Tensor gradients and numpy array gradients
            if isinstance(param.grad, np.ndarray):
                grad_data = param.grad
            else:
                # Trust that Tensor has .data attribute
                grad_data = param.grad.data
            total_norm += np.sum(grad_data ** 2)

    total_norm = np.sqrt(total_norm)

    # Clip if necessary
    if total_norm > max_norm:
        clip_coef = max_norm / total_norm
        for param in parameters:
            if param.grad is not None:
                # Handle both Tensor gradients and numpy array gradients
                if isinstance(param.grad, np.ndarray):
                    param.grad = param.grad * clip_coef
                else:
                    # Trust that Tensor has .data attribute
                    param.grad.data = param.grad.data * clip_coef

    return float(total_norm)
    ### END SOLUTION
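
# --- Illustrative usage sketch (not part of the generated module) ---
# Gradients of global norm ~37.4 get scaled down to max_norm, while the
# pre-clip norm is returned for logging (assumes the Tensor class above):
# >>> p = Tensor([1.0, 2.0, 3.0], requires_grad=True)
# >>> p.grad = np.array([10.0, 20.0, 30.0])
# >>> norm_before = clip_grad_norm([p], max_norm=5.0)
# >>> round(norm_before, 1)        # sqrt(100 + 400 + 900)
# 37.4
# >>> round(float(np.linalg.norm(p.grad)), 1)
# 5.0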

# %% ../../modules/07_training/07_training.ipynb 14
class Trainer:
    """

tinytorch/data/loader.py (generated, 209 changes)
@@ -15,7 +15,7 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['Dataset', 'TensorDataset', 'DataLoader']
+__all__ = ['Dataset', 'TensorDataset', 'DataLoader', 'RandomHorizontalFlip', 'RandomCrop', 'Compose']

# %% ../../modules/08_dataloader/08_dataloader.ipynb 0
#| default_exp data.loader
@@ -262,3 +262,210 @@ class DataLoader:

        return tuple(batched_tensors)
        ### END SOLUTION

# %% ../../modules/08_dataloader/08_dataloader.ipynb 12
class RandomHorizontalFlip:
    """
    Randomly flip images horizontally with given probability.

    A simple but effective augmentation for most image datasets.
    Flipping is appropriate when horizontal orientation doesn't change class
    (cats, dogs, cars - not digits or text!).

    Args:
        p: Probability of flipping (default: 0.5)
    """

    def __init__(self, p=0.5):
        """
        Initialize RandomHorizontalFlip.

        TODO: Store flip probability

        EXAMPLE:
        >>> flip = RandomHorizontalFlip(p=0.5)  # 50% chance to flip
        """
        ### BEGIN SOLUTION
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"Probability must be between 0 and 1, got {p}")
        self.p = p
        ### END SOLUTION

    def __call__(self, x):
        """
        Apply random horizontal flip to input.

        TODO: Implement random horizontal flip

        APPROACH:
        1. Generate random number in [0, 1)
        2. If random < p, flip horizontally
        3. Otherwise, return unchanged

        Args:
            x: Input array with spatial layout (..., H, W), e.g. (H, W) or (C, H, W).
               Flips along the last axis (the width dimension in these layouts).

        Returns:
            Flipped or unchanged array (same shape as input)

        EXAMPLE:
        >>> flip = RandomHorizontalFlip(0.5)
        >>> img = np.array([[1, 2, 3], [4, 5, 6]])  # 2x3 image
        >>> # 50% chance output is [[3, 2, 1], [6, 5, 4]]

        HINT: Use np.flip(x, axis=-1) to flip along width axis
        """
        ### BEGIN SOLUTION
        if np.random.random() < self.p:
            # Flip along the width axis: axis=-1 is width for (H, W) and
            # (C, H, W) layouts. Note that for (H, W, C) inputs the width
            # axis would be -2, so channels-last data needs a different axis.
            if isinstance(x, Tensor):
                return Tensor(np.flip(x.data, axis=-1).copy())
            else:
                return np.flip(x, axis=-1).copy()
        return x
        ### END SOLUTION

#| export

class RandomCrop:
    """
    Randomly crop image after padding.

    This is the standard augmentation for CIFAR-10:
    1. Pad image by `padding` pixels on each side
    2. Randomly crop back to original size

    This simulates small translations in the image, forcing the model
    to recognize objects regardless of their exact position.

    Args:
        size: Output crop size (int for square, or tuple (H, W))
        padding: Pixels to pad on each side before cropping (default: 4)
    """

    def __init__(self, size, padding=4):
        """
        Initialize RandomCrop.

        TODO: Store crop parameters

        EXAMPLE:
        >>> crop = RandomCrop(32, padding=4)  # CIFAR-10 standard
        >>> # Pads to 40x40, then crops back to 32x32
        """
        ### BEGIN SOLUTION
        if isinstance(size, int):
            self.size = (size, size)
        else:
            self.size = size
        self.padding = padding
        ### END SOLUTION

    def __call__(self, x):
        """
        Apply random crop after padding.

        TODO: Implement random crop with padding

        APPROACH:
        1. Add zero-padding to all sides
        2. Choose random top-left corner for crop
        3. Extract crop of target size

        Args:
            x: Input image with shape (C, H, W) or (H, W) or (H, W, C)
               Assumes spatial dimensions are H, W

        Returns:
            Cropped image with target size

        EXAMPLE:
        >>> crop = RandomCrop(32, padding=4)
        >>> img = np.random.randn(3, 32, 32)  # CIFAR-10 format (C, H, W)
        >>> out = crop(img)
        >>> print(out.shape)  # (3, 32, 32)

        HINTS:
        - Use np.pad for adding zeros
        - Handle both (C, H, W) and (H, W) formats
        - When cropping back to the original size, valid offsets are [0, 2*padding]
        """
        ### BEGIN SOLUTION
        is_tensor = isinstance(x, Tensor)
        data = x.data if is_tensor else x

        target_h, target_w = self.size

        # Determine image format and dimensions
        if len(data.shape) == 2:
            # (H, W) format
            h, w = data.shape
            padded = np.pad(data, self.padding, mode='constant', constant_values=0)

            # Random crop position: valid top-left offsets are
            # [0, padded_size - target_size]
            top = np.random.randint(0, 2 * self.padding + h - target_h + 1)
            left = np.random.randint(0, 2 * self.padding + w - target_w + 1)

            cropped = padded[top:top + target_h, left:left + target_w]

        elif len(data.shape) == 3:
            if data.shape[0] <= 4:  # Likely (C, H, W) format
                c, h, w = data.shape
                # Pad only spatial dimensions
                padded = np.pad(data,
                                ((0, 0), (self.padding, self.padding), (self.padding, self.padding)),
                                mode='constant', constant_values=0)

                # Random crop position (same valid range as the 2D case)
                top = np.random.randint(0, 2 * self.padding + h - target_h + 1)
                left = np.random.randint(0, 2 * self.padding + w - target_w + 1)

                cropped = padded[:, top:top + target_h, left:left + target_w]
            else:  # Likely (H, W, C) format
                h, w, c = data.shape
                padded = np.pad(data,
                                ((self.padding, self.padding), (self.padding, self.padding), (0, 0)),
                                mode='constant', constant_values=0)

                top = np.random.randint(0, 2 * self.padding + h - target_h + 1)
                left = np.random.randint(0, 2 * self.padding + w - target_w + 1)

                cropped = padded[top:top + target_h, left:left + target_w, :]
        else:
            raise ValueError(f"Expected 2D or 3D input, got shape {data.shape}")

        return Tensor(cropped) if is_tensor else cropped
        ### END SOLUTION

#| export

class Compose:
    """
    Compose multiple transforms into a pipeline.

    Applies transforms in sequence, passing output of each
    as input to the next.

    Args:
        transforms: List of transform callables
    """

    def __init__(self, transforms):
        """
        Initialize Compose with list of transforms.

        EXAMPLE:
        >>> transforms = Compose([
        ...     RandomHorizontalFlip(0.5),
        ...     RandomCrop(32, padding=4)
        ... ])
        """
        self.transforms = transforms

    def __call__(self, x):
        """Apply all transforms in sequence."""
        for transform in self.transforms:
            x = transform(x)
        return x
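
# --- Illustrative usage sketch (not part of the generated module) ---
# The standard CIFAR-10 augmentation pipeline built from the classes above:
# >>> augment = Compose([RandomHorizontalFlip(0.5), RandomCrop(32, padding=4)])
# >>> img = np.random.randn(3, 32, 32)   # (C, H, W)
# >>> out = augment(img)
# >>> out.shape
# (3, 32, 32)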

tinytorch/generation/kv_cache.py (generated, 6 changes)

@@ -15,7 +15,7 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'KVCache', 'enable_kv_cache', 'disable_kv_cache']
+__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'KVCache', 'create_kv_cache', 'enable_kv_cache', 'disable_kv_cache']

# %% ../../modules/17_memoization/17_memoization.ipynb 1
import numpy as np
@@ -303,11 +303,11 @@ class KVCache:
        }

# %% ../../modules/17_memoization/17_memoization.ipynb 11
-def enable_kv_cache(batch_size: int, max_seq_len: int, num_layers: int,
+def create_kv_cache(batch_size: int, max_seq_len: int, num_layers: int,
                     num_heads: int, head_dim: int) -> KVCache:
    """
    Create and return a KVCache instance for model generation.

    This function creates a properly sized cache for the model architecture.
    Call this before starting generation, then pass the cache to your
    generation loop.

tinytorch/optimization/acceleration.py (generated, 202 changes)
@@ -15,8 +15,208 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = []
+__all__ = ['vectorized_matmul', 'fused_gelu', 'tiled_matmul']

# %% ../../modules/18_acceleration/18_acceleration.ipynb 0
#| default_exp optimization.acceleration
#| export

# %% ../../modules/18_acceleration/18_acceleration.ipynb 7
def vectorized_matmul(a: Tensor, b: Tensor) -> Tensor:
    """
    High-performance matrix multiplication using vectorized operations.

    This implementation leverages optimized BLAS libraries that use:
    - SIMD instructions for parallel computation
    - Cache-blocking for memory efficiency
    - Multi-threading for CPU parallelization

    TODO: Implement production-grade matrix multiplication

    APPROACH:
    1. Validate shapes are compatible for matrix multiplication
    2. Use NumPy's optimized dot product (calls BLAS GEMM)
    3. Return result wrapped in Tensor

    Args:
        a: First tensor for multiplication (M×K or batch×M×K)
        b: Second tensor for multiplication (K×N or batch×K×N)

    Returns:
        Result tensor of shape (M×N or batch×M×N)

    EXAMPLE:
    Matrix multiplication visualization:
    >>> a = Tensor([[1, 2], [3, 4]])  # 2×2
    >>> b = Tensor([[5, 6], [7, 8]])  # 2×2
    >>> result = vectorized_matmul(a, b)
    >>> print(result.data)
    [[19 22]   # [1×5+2×7, 1×6+2×8] = [19, 22]
     [43 50]]  # [3×5+4×7, 3×6+4×8] = [43, 50]

    PERFORMANCE CHARACTERISTICS:
    - Time Complexity: O(N³) but highly optimized
    - Space Complexity: O(N²) for result
    - Arithmetic Intensity: 2N³ FLOPs / 3N² bytes = 2N/3 (good for large N)

    HINTS:
    - Check a.shape[-1] == b.shape[-2] for inner dimension match
    - Use np.matmul() for batch support and optimization
    - Trust BLAS to handle the vectorization magic
    """
    ### BEGIN SOLUTION
    # Input validation for matrix multiplication
    if len(a.shape) < 2 or len(b.shape) < 2:
        raise ValueError(
            f"Matrix multiplication requires 2D+ tensors, got shapes {a.shape} and {b.shape}. "
            f"💡 HINT: Use reshape() to add dimensions if needed."
        )

    if a.shape[-1] != b.shape[-2]:
        raise ValueError(
            f"Matrix multiplication shape mismatch: {a.shape} @ {b.shape}. "
            f"Inner dimensions must match: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}. "
            f"💡 HINT: For A@B, A's columns must equal B's rows."
        )

    # Use NumPy's highly optimized matrix multiplication
    # This calls BLAS GEMM (General Matrix Multiply), which uses:
    # - SIMD vectorization for parallel arithmetic
    # - Cache blocking for memory efficiency
    # - Multi-threading on multi-core systems
    result_data = np.matmul(a.data, b.data)

    return Tensor(result_data)
    ### END SOLUTION

# %% ../../modules/18_acceleration/18_acceleration.ipynb 10
def fused_gelu(x: Tensor) -> Tensor:
    """
    Fused GELU activation that combines all operations in a single kernel.

    GELU combines the benefits of ReLU and sigmoid:
    - Smooth everywhere (unlike ReLU's discontinuity at 0)
    - Non-saturating for positive values (unlike sigmoid)
    - Probabilistic interpretation: x * P(X ≤ x) where X ~ N(0,1)

    Mathematical Definition:
    GELU(x) = x * Φ(x) where Φ(x) is the standard normal CDF

    Fast Approximation (used here):
    GELU(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))

    TODO: Implement fused GELU to minimize memory bandwidth

    APPROACH:
    1. Compute all intermediate values in a single expression
    2. Avoid creating temporary arrays
    3. Let NumPy's broadcasting handle vectorization

    Args:
        x: Input tensor to apply GELU activation

    Returns:
        GELU-activated tensor (same shape as input)

    EXAMPLE:
    >>> x = Tensor([-2, -1, 0, 1, 2])
    >>> result = fused_gelu(x)
    >>> print(result.data)
    [-0.04550026 -0.15865526  0.          0.8413447   1.9544997 ]
    # Notice: smooth transition through 0, positive bias

    MEMORY EFFICIENCY:
    - Unfused: 5 temporary arrays × input_size × 4 bytes
    - Fused: 0 temporary arrays, direct computation
    - Bandwidth reduction: ~80% for memory-bound operations

    HINTS:
    - Use np.sqrt(2.0 / np.pi) for the constant
    - Keep entire expression in one line for maximum fusion
    - NumPy will optimize the expression tree automatically
    """
    ### BEGIN SOLUTION
    # Mathematical constant for GELU approximation
    sqrt_2_over_pi = np.sqrt(2.0 / np.pi)

    # Fused GELU computation - all operations in single expression
    # This minimizes named temporaries and Python-level bookkeeping
    # (NumPy still materializes intermediates per operation; a truly fused
    # kernel would avoid them entirely, but here they stay short-lived)
    result_data = 0.5 * x.data * (
        1.0 + np.tanh(sqrt_2_over_pi * (x.data + 0.044715 * x.data**3))
    )

    return Tensor(result_data)
    ### END SOLUTION
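
# --- Illustrative sketch (not part of the generated module) ---
# The "unfused" version the docstring counts temporaries against, written
# out step by step for comparison (hypothetical helper, same math):
# >>> def unfused_gelu(x):
# ...     c = np.sqrt(2.0 / np.pi)
# ...     t1 = x.data ** 3              # temporary 1
# ...     t2 = x.data + 0.044715 * t1   # temporaries 2-3
# ...     t3 = np.tanh(c * t2)          # temporary 4
# ...     return Tensor(0.5 * x.data * (1.0 + t3))  # temporary 5
# >>> np.allclose(unfused_gelu(Tensor([-1.0, 1.0])).data,
# ...             fused_gelu(Tensor([-1.0, 1.0])).data)
# True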

# %% ../../modules/18_acceleration/18_acceleration.ipynb 16
def tiled_matmul(a: Tensor, b: Tensor, tile_size: int = 64) -> Tensor:
    """
    Cache-aware matrix multiplication using tiling/blocking.

    Demonstrates blocking algorithm for cache optimization by breaking
    large matrix multiplications into cache-sized chunks.

    TODO: Implement cache-aware tiled matrix multiplication

    APPROACH:
    1. Validate inputs for matrix multiplication compatibility
    2. Use NumPy's optimized matmul (which already implements tiling internally)
    3. In production, explicit tiling would use nested loops over blocks

    Args:
        a: First matrix (M×K)
        b: Second matrix (K×N)
        tile_size: Block size for cache efficiency (default: 64)

    Returns:
        Result matrix (M×N)

    EXAMPLE:
    >>> a = Tensor(np.random.randn(256, 256))
    >>> b = Tensor(np.random.randn(256, 256))
    >>> result = tiled_matmul(a, b, tile_size=64)
    >>> # Same result as vectorized_matmul, but more cache-friendly for large matrices

    PERFORMANCE CHARACTERISTICS:
    - Reduces cache misses by working on blocks that fit in L1/L2
    - Especially beneficial for matrices larger than cache size
    - tile_size should be chosen so the working tile blocks fit in cache
      (64 is a common default)

    HINTS:
    - For educational purposes, we use NumPy's optimized BLAS
    - BLAS libraries (MKL, OpenBLAS) already implement cache blocking
    - Explicit tiling would use 6 nested loops (3 for tiles, 3 for elements)
    """
    ### BEGIN SOLUTION
    # Input validation
    if len(a.shape) < 2 or len(b.shape) < 2:
        raise ValueError(
            f"Tiled matmul requires 2D+ tensors, got shapes {a.shape} and {b.shape}. "
            f"💡 HINT: Tiling works on matrix operations."
        )

    if a.shape[-1] != b.shape[-2]:
        raise ValueError(
            f"Shape mismatch: {a.shape} @ {b.shape}. "
            f"Inner dimensions must match for matrix multiplication. "
            f"💡 HINT: a.shape[-1]={a.shape[-1]} != b.shape[-2]={b.shape[-2]}"
        )

    # For educational purposes, we use NumPy's matmul which already
    # implements cache-aware tiling via BLAS libraries (MKL, OpenBLAS)
    # These libraries automatically partition large matrices into
    # cache-sized blocks for optimal performance

    # In a full educational implementation, you would write:
    # for i_tile in range(0, M, tile_size):
    #     for j_tile in range(0, N, tile_size):
    #         for k_tile in range(0, K, tile_size):
    #             # Multiply tile blocks that fit in cache
    #             C[i_tile:i_tile+tile_size, j_tile:j_tile+tile_size] +=
    #                 A[i_tile:i_tile+tile_size, k_tile:k_tile+tile_size] @
    #                 B[k_tile:k_tile+tile_size, j_tile:j_tile+tile_size]

    result_data = np.matmul(a.data, b.data)
    return Tensor(result_data)
    ### END SOLUTION
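
# --- Illustrative sketch (not part of the generated module) ---
# The explicit blocking described in the hints above, as plain NumPy: three
# tile loops, with the matmul on each block standing in for the three inner
# element loops (hypothetical helper; slicing handles ragged edges):
# >>> def explicit_tiled_matmul(A, B, tile=64):
# ...     M, K = A.shape
# ...     K2, N = B.shape
# ...     assert K == K2
# ...     C = np.zeros((M, N), dtype=A.dtype)
# ...     for i in range(0, M, tile):
# ...         for j in range(0, N, tile):
# ...             for k in range(0, K, tile):
# ...                 C[i:i+tile, j:j+tile] += A[i:i+tile, k:k+tile] @ B[k:k+tile, j:j+tile]
# ...     return C
# >>> A = np.random.randn(128, 96); B = np.random.randn(96, 64)
# >>> np.allclose(explicit_tiled_matmul(A, B, tile=32), A @ B)
# True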

tinytorch/optimization/compression.py (generated, 46 changes)

@@ -15,8 +15,8 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'magnitude_prune', 'structured_prune', 'KnowledgeDistillation',
-           'CompressionComplete', 'measure_sparsity', 'compress_model']
+__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'magnitude_prune', 'structured_prune', 'low_rank_approximate',
+           'KnowledgeDistillation', 'CompressionComplete', 'measure_sparsity', 'compress_model']

# %% ../../modules/16_compression/16_compression.ipynb 1
import numpy as np
@@ -145,6 +145,48 @@ def structured_prune(model, prune_ratio=0.5):
    return model
    ### END SOLUTION

# %% ../../modules/16_compression/16_compression.ipynb 18
def low_rank_approximate(weight_matrix, rank_ratio=0.5):
    """
    Approximate weight matrix using low-rank decomposition (SVD).

    TODO: Implement SVD-based low-rank approximation

    APPROACH:
    1. Perform SVD: W = U @ S @ V^T
    2. Keep only top k singular values where k = rank_ratio * min(dimensions)
    3. Reconstruct: W_approx = U[:,:k] @ diag(S[:k]) @ V[:k,:]
    4. Return decomposed matrices for memory savings

    EXAMPLE:
    >>> weight = np.random.randn(100, 50)
    >>> U, S, V = low_rank_approximate(weight, rank_ratio=0.3)
    >>> # Original: 100*50 = 5000 params
    >>> # Compressed: 100*15 + 15 + 15*50 = 2265 params (~55% reduction)

    HINTS:
    - Use np.linalg.svd() for decomposition
    - Choose k = int(rank_ratio * min(m, n))
    - Return U[:,:k], S[:k], V[:k,:] for reconstruction
    """
    ### BEGIN SOLUTION
    m, n = weight_matrix.shape

    # Perform SVD
    U, S, V = np.linalg.svd(weight_matrix, full_matrices=False)

    # Determine target rank
    max_rank = min(m, n)
    target_rank = max(1, int(rank_ratio * max_rank))

    # Truncate to target rank
    U_truncated = U[:, :target_rank]
    S_truncated = S[:target_rank]
    V_truncated = V[:target_rank, :]

    return U_truncated, S_truncated, V_truncated
    ### END SOLUTION
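
# --- Illustrative usage sketch (not part of the generated module) ---
# Reconstructing the approximation and counting the parameters kept:
# >>> W = np.random.randn(100, 50)
# >>> U, S, V = low_rank_approximate(W, rank_ratio=0.3)
# >>> W_approx = U @ np.diag(S) @ V
# >>> W_approx.shape
# (100, 50)
# >>> U.size + S.size + V.size   # 1500 + 15 + 750, vs. 5000 originally
# 2265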

# %% ../../modules/16_compression/16_compression.ipynb 21
class KnowledgeDistillation:
    """

tinytorch/optimization/quantization.py (generated, 82 changes)

@@ -16,8 +16,8 @@
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['INT8_MIN_VALUE', 'INT8_MAX_VALUE', 'INT8_RANGE', 'EPSILON', 'BYTES_PER_FLOAT32', 'BYTES_PER_INT8', 'MB_TO_BYTES',
-           'SimpleModel', 'QuantizedLinear', 'QuantizationComplete', 'quantize_int8', 'dequantize_int8',
-           'quantize_model']
+           'SimpleModel', 'QuantizedLinear', 'compare_model_sizes', 'QuantizationComplete', 'quantize_int8',
+           'dequantize_int8', 'quantize_model']

# %% ../../modules/15_quantization/15_quantization.ipynb 3
import numpy as np
@@ -198,6 +198,84 @@ class QuantizedLinear:
        }
        ### END SOLUTION

# %% ../../modules/15_quantization/15_quantization.ipynb 24
def compare_model_sizes(original_model, quantized_model) -> Dict[str, float]:
    """
    Compare memory usage between original and quantized models.

    TODO: Calculate comprehensive memory comparison

    APPROACH:
    1. Count parameters in both models
    2. Calculate bytes used (FP32 vs INT8)
    3. Include quantization overhead
    4. Return comparison metrics

    Args:
        original_model: Model before quantization
        quantized_model: Model after quantization

    Returns:
        Dictionary with parameter counts ('original_params', 'quantized_params'),
        byte counts ('original_bytes', 'quantized_bytes'), 'compression_ratio',
        'memory_saved_mb', and 'memory_saved_percent'

    EXAMPLE:
    >>> layer1 = Linear(100, 50)
    >>> layer2 = Linear(50, 10)
    >>> model = SimpleModel(layer1, layer2)
    >>> quantize_model(model)
    >>> stats = compare_model_sizes(model, model)  # Same model after in-place quantization
    >>> print(f"Reduced to {stats['compression_ratio']:.1f}x smaller")
    Reduced to 4.0x smaller

    HINTS:
    - FP32 uses 4 bytes per parameter, INT8 uses 1 byte
    - Include scale/zero_point overhead (2 values per quantized layer)
    - Expected ratio: ~4x for INT8 quantization
    """
    ### BEGIN SOLUTION
    # Count original model parameters
    # SimpleModel has .layers attribute, layers may have .parameters() method
    original_params = 0
    original_bytes = 0
    for layer in original_model.layers:
        if hasattr(layer, 'parameters'):
            params = layer.parameters()
            for param in params:
                original_params += param.data.size
                original_bytes += param.data.size * BYTES_PER_FLOAT32

    # Count quantized model parameters
    quantized_params = 0
    quantized_bytes = 0
    for layer in quantized_model.layers:
        if isinstance(layer, QuantizedLinear):
            memory_info = layer.memory_usage()
            quantized_bytes += memory_info['quantized_bytes']
            params = layer.parameters()
            for param in params:
                quantized_params += param.data.size
        else:
            # Non-quantized layers - may have .parameters() method
            if hasattr(layer, 'parameters'):
                params = layer.parameters()
                for param in params:
                    quantized_params += param.data.size
                    quantized_bytes += param.data.size * BYTES_PER_FLOAT32

    compression_ratio = original_bytes / quantized_bytes if quantized_bytes > 0 else 1.0
    memory_saved = original_bytes - quantized_bytes

    return {
        'original_params': original_params,
        'quantized_params': quantized_params,
        'original_bytes': original_bytes,
        'quantized_bytes': quantized_bytes,
        'compression_ratio': compression_ratio,
        'memory_saved_mb': memory_saved / MB_TO_BYTES,
        'memory_saved_percent': (memory_saved / original_bytes) * 100 if original_bytes > 0 else 0
    }
    ### END SOLUTION

# %% ../../modules/15_quantization/15_quantization.ipynb 36
class QuantizationComplete:
    """

tinytorch/text/embeddings.py (generated, 64 changes)

@@ -15,7 +15,8 @@
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
-__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'Embedding', 'PositionalEncoding', 'EmbeddingLayer']
+__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'Embedding', 'PositionalEncoding', 'create_sinusoidal_embeddings',
+           'EmbeddingLayer']

# %% ../../modules/11_embeddings/11_embeddings.ipynb 2
import numpy as np
@@ -226,6 +227,67 @@ class PositionalEncoding:
        return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
        ### END SOLUTION

# %% ../../modules/11_embeddings/11_embeddings.ipynb 14
def create_sinusoidal_embeddings(max_seq_len: int, embed_dim: int) -> Tensor:
    """
    Create sinusoidal positional encodings as used in "Attention Is All You Need".

    These fixed encodings use sine and cosine functions to create unique
    positional patterns that don't require training and can extrapolate
    to longer sequences than seen during training.

    TODO: Implement sinusoidal positional encoding generation

    APPROACH:
    1. Create position indices: [0, 1, 2, ..., max_seq_len-1]
    2. Create dimension indices for frequency calculation
    3. Apply sine to even dimensions, cosine to odd dimensions
    4. Use the transformer paper formula with 10000 base

    MATHEMATICAL FORMULA:
    PE(pos, 2i)   = sin(pos / 10000^(2i/embed_dim))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/embed_dim))

    EXAMPLE:
    >>> pe = create_sinusoidal_embeddings(512, 64)
    >>> print(pe.shape)
    (512, 64)
    >>> # Position 0: [0, 1, 0, 1, 0, 1, ...] (sin(0)=0, cos(0)=1)
    >>> # Each position gets unique trigonometric signature

    HINTS:
    - Use np.arange to create position and dimension arrays
    - Calculate div_term using exponential for frequency scaling
    - Apply different formulas to even/odd dimensions
    - The 10000 base creates different frequencies for different dimensions
    """
    ### BEGIN SOLUTION
    # Create position indices [0, 1, 2, ..., max_seq_len-1]
    position = np.arange(max_seq_len, dtype=np.float32)[:, np.newaxis]  # (max_seq_len, 1)

    # Create dimension indices for calculating frequencies
    div_term = np.exp(
        np.arange(0, embed_dim, 2, dtype=np.float32) *
        -(math.log(10000.0) / embed_dim)
    )  # (ceil(embed_dim/2),)

    # Initialize the positional encoding matrix
    pe = np.zeros((max_seq_len, embed_dim), dtype=np.float32)

    # Apply sine to even indices (0, 2, 4, ...)
    pe[:, 0::2] = np.sin(position * div_term)

    # Apply cosine to odd indices (1, 3, 5, ...)
    if embed_dim % 2 == 1:
        # Handle odd embed_dim by only filling available positions
        pe[:, 1::2] = np.cos(position * div_term[:-1])
    else:
        pe[:, 1::2] = np.cos(position * div_term)

    return Tensor(pe)
    ### END SOLUTION
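
# --- Illustrative usage sketch (not part of the generated module) ---
# Position 0 alternates [0, 1, 0, 1, ...] exactly as the docstring notes,
# since sin(0) = 0 and cos(0) = 1 for every frequency:
# >>> pe = create_sinusoidal_embeddings(16, 8)
# >>> pe.data[0]
# array([0., 1., 0., 1., 0., 1., 0., 1.], dtype=float32)
# >>> pe.data.shape
# (16, 8)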

# %% ../../modules/11_embeddings/11_embeddings.ipynb 18
class EmbeddingLayer:
    """

@@ -14,6 +14,7 @@ from .src import SrcCommand
from .nbgrader import NBGraderCommand
from .benchmark import BenchmarkCommand
from .community import CommunityCommand
from .verify import VerifyCommand

# Command groups (with subcommands organized in subfolders)
from .system import SystemCommand
@@ -29,6 +30,7 @@ __all__ = [
    'NBGraderCommand',
    'BenchmarkCommand',
    'CommunityCommand',
    'VerifyCommand',
    # Command groups
    'SystemCommand',
    'ModuleWorkflowCommand',

tito/commands/verify.py (new file, 232 lines)

@@ -0,0 +1,232 @@
"""
|
||||
TinyTorch Verify Command
|
||||
|
||||
Checks that the environment is set up correctly and ready to use.
|
||||
On success, prompts to join the community map.
|
||||
|
||||
This is essentially `tito system health` + package import check + postcard.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import webbrowser
|
||||
from argparse import ArgumentParser, Namespace
|
||||
from pathlib import Path
|
||||
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich import box
|
||||
|
||||
from .base import BaseCommand
|
||||
|
||||
|
||||
class VerifyCommand(BaseCommand):
|
||||
"""Verify TinyTorch setup is ready, then join the community."""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "verify"
|
||||
|
||||
@property
|
||||
def description(self) -> str:
|
||||
return "Verify setup is ready, then join the community map"
|
||||
|
||||
def add_arguments(self, parser: ArgumentParser) -> None:
|
||||
parser.add_argument(
|
||||
"--skip-registration",
|
||||
action="store_true",
|
||||
help="Skip registration prompt after verification"
|
||||
)
|
||||
|
||||
def run(self, args: Namespace) -> int:
|
||||
"""Run verification checks and prompt for registration."""
|
||||
|
||||
self.console.print()
|
||||
self.console.print(Panel.fit(
|
||||
"[bold cyan]🔬 Verifying TinyTorch Setup[/bold cyan]",
|
||||
border_style="cyan"
|
||||
))
|
||||
self.console.print()
|
||||
|
||||
all_passed = True
|
||||
|
||||
# 1. Environment checks
|
||||
all_passed &= self._check_environment()
|
||||
|
||||
# 2. Project structure checks
|
||||
all_passed &= self._check_structure()
|
||||
|
||||
# 3. Package import checks
|
||||
all_passed &= self._check_package()
|
||||
|
||||
# Result
|
||||
self.console.print()
|
||||
if all_passed:
|
||||
self._show_success()
|
||||
if not args.skip_registration:
|
||||
self._prompt_registration()
|
||||
return 0
|
||||
else:
|
||||
self._show_failure()
|
||||
return 1
|
||||
|
||||
def _check_environment(self) -> bool:
|
||||
"""Check Python environment and dependencies."""
|
||||
self.console.print("[bold]Environment[/bold]")
|
||||
|
||||
all_ok = True
|
||||
|
||||
# Python
|
||||
self.console.print(f" [green]✓[/green] Python {sys.version.split()[0]}")
|
||||
|
||||
# Virtual environment
|
||||
venv_exists = self.venv_path.exists()
|
||||
in_venv = (
|
||||
os.environ.get('VIRTUAL_ENV') is not None or
|
||||
(hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix) or
|
||||
hasattr(sys, 'real_prefix')
|
||||
)
|
||||
|
||||
if venv_exists and in_venv:
|
||||
self.console.print(" [green]✓[/green] Virtual environment active")
|
||||
elif venv_exists:
|
||||
self.console.print(" [yellow]![/yellow] Virtual environment exists but not active")
|
||||
self.console.print(" [dim]Run: source activate.sh[/dim]")
|
||||
else:
|
||||
self.console.print(" [yellow]![/yellow] No virtual environment")
|
||||
|
||||
# Required dependencies
|
||||
required = [
|
||||
('numpy', 'NumPy'),
|
||||
('rich', 'Rich'),
|
||||
('yaml', 'PyYAML'),
|
||||
]
|
||||
|
||||
for module, name in required:
|
||||
try:
|
||||
__import__(module)
|
||||
self.console.print(f" [green]✓[/green] {name}")
|
||||
except ImportError:
|
||||
self.console.print(f" [red]✗[/red] {name} [dim](pip install {module})[/dim]")
|
||||
all_ok = False
|
||||
|
||||
self.console.print()
|
||||
return all_ok
|
||||
|
||||
def _check_structure(self) -> bool:
|
||||
"""Check project structure exists."""
|
||||
self.console.print("[bold]Project Structure[/bold]")
|
||||
|
||||
all_ok = True
|
||||
|
||||
paths = [
|
||||
('tinytorch/', 'Package'),
|
||||
('tinytorch/core/', 'Core modules'),
|
||||
('src/', 'Source modules'),
|
||||
]
|
||||
|
||||
for path, desc in paths:
|
||||
if Path(path).exists():
|
||||
self.console.print(f" [green]✓[/green] {path}")
|
||||
else:
|
||||
self.console.print(f" [red]✗[/red] {path} [dim]({desc})[/dim]")
|
||||
all_ok = False
|
||||
|
||||
self.console.print()
|
||||
return all_ok
|
||||
|
||||
def _check_package(self) -> bool:
|
||||
"""Check that tinytorch package is importable."""
|
||||
self.console.print("[bold]Package[/bold]")
|
||||
|
||||
all_ok = True
|
||||
|
||||
# Import tinytorch
|
||||
try:
|
||||
import tinytorch
|
||||
self.console.print(" [green]✓[/green] import tinytorch")
|
||||
except ImportError as e:
|
||||
self.console.print(f" [red]✗[/red] import tinytorch")
|
||||
self.console.print(f" [dim red]{e}[/dim red]")
|
||||
return False
|
||||
|
||||
# Check core components
|
||||
try:
|
||||
from tinytorch import Tensor
|
||||
self.console.print(" [green]✓[/green] Tensor available")
|
||||
except ImportError:
|
||||
self.console.print(" [red]✗[/red] Tensor not available")
|
||||
all_ok = False
|
||||
|
||||
try:
|
||||
from tinytorch import Linear, ReLU
|
||||
self.console.print(" [green]✓[/green] Layers available")
|
||||
except ImportError:
|
||||
self.console.print(" [red]✗[/red] Layers not available")
|
||||
all_ok = False
|
||||
|
||||
try:
|
||||
from tinytorch import SGD
|
||||
self.console.print(" [green]✓[/green] Optimizer available")
|
||||
except ImportError:
|
||||
self.console.print(" [red]✗[/red] Optimizer not available")
|
||||
all_ok = False
|
||||
|
||||
return all_ok
|
||||
|
||||
def _show_success(self) -> None:
|
||||
"""Show success message."""
|
||||
self.console.print(Panel.fit(
|
||||
"[bold green]✅ TinyTorch is ready![/bold green]\n\n"
|
||||
"Your environment is set up correctly.\n"
|
||||
"You can start working on modules.",
|
||||
border_style="green",
|
||||
box=box.ROUNDED
|
||||
))
|
||||
|
||||
def _show_failure(self) -> None:
|
||||
"""Show failure message."""
|
||||
self.console.print(Panel.fit(
|
||||
"[bold red]❌ Setup incomplete[/bold red]\n\n"
|
||||
"Some checks failed. See above for details.\n\n"
|
||||
"[dim]Run 'tito setup' to fix common issues[/dim]",
|
||||
border_style="red",
|
||||
box=box.ROUNDED
|
||||
))
|
||||
|
||||
def _prompt_registration(self) -> None:
|
||||
"""Prompt user to join the community."""
|
||||
from rich.prompt import Confirm
|
||||
|
||||
self.console.print()
|
||||
self.console.print(Panel.fit(
|
||||
"[bold cyan]🌍 Join the TinyTorch Community[/bold cyan]\n\n"
|
||||
"Add yourself to the map at [link=https://tinytorch.ai/map]tinytorch.ai/map[/link]\n\n"
|
||||
"[dim]• See learners worldwide\n"
|
||||
"• Country & institution (optional)\n"
|
||||
"• No account required[/dim]",
|
||||
border_style="cyan"
|
||||
))
|
||||
|
||||
join = Confirm.ask("\n[bold]Join the community?[/bold]", default=True)
|
||||
|
||||
if join:
|
||||
self._open_registration()
|
||||
else:
|
||||
self.console.print("[dim]No problem! Run 'tito verify' anytime to join later.[/dim]")
|
||||
|
||||
def _open_registration(self) -> None:
|
||||
"""Open registration page."""
|
||||
url = "https://tinytorch.ai/join"
|
||||
|
||||
self.console.print(f"\n[cyan]Opening registration...[/cyan]")
|
||||
|
||||
try:
|
||||
webbrowser.open(url)
|
||||
self.console.print(f"[green]✓[/green] Browser opened")
|
||||
self.console.print(f"[dim] {url}[/dim]")
|
||||
except Exception:
|
||||
self.console.print(f"[yellow]Could not open browser.[/yellow]")
|
||||
self.console.print(f"Please visit: [cyan]{url}[/cyan]")
|
||||
|
||||
self.console.print("\n[green]Welcome to the community! 🎉[/green]")

@@ -38,6 +38,7 @@ from .commands.milestone import MilestoneCommand
from .commands.setup import SetupCommand
from .commands.benchmark import BenchmarkCommand
from .commands.community import CommunityCommand
from .commands.verify import VerifyCommand

# Configure logging
logging.basicConfig(
@@ -79,6 +80,8 @@ class TinyTorchCLI:
        'test': TestCommand,
        'grade': GradeCommand,
        'logo': LogoCommand,
        # Verification
        'verify': VerifyCommand,
    }

    # Command categorization for help display
@@ -91,6 +94,7 @@ class TinyTorchCLI:
        ('[green]tito setup[/green]', 'First-time setup'),
        ('[green]tito module start 01[/green]', 'Start Module 01 (tensors)'),
        ('[green]tito module complete 01[/green]', 'Test, export, and track progress'),
        ('[green]tito verify[/green]', 'Verify installation and join community'),
    ],
    'track_progress': [
        ('[yellow]tito module status[/yellow]', 'View module progress'),