mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-30 22:07:37 -05:00
This commit implements the pedagogically optimal "inevitable discovery" module progression based on expert validation and educational design principles. ## Module Reordering Summary **Previous Order (Problems)**: - 05_losses → 06_autograd → 07_dataloader → 08_optimizers → 09_spatial → 10_training - Issues: Autograd before optimizers, DataLoader before training, scattered dependencies **New Order (Beautiful Progression)**: - 05_losses → 06_optimizers → 07_autograd → 08_training → 09_spatial → 10_dataloader - Benefits: Each module creates inevitable need for the next ## Pedagogical Flow Achieved **05_losses** → "Need systematic weight updates" → **06_optimizers** **06_optimizers** → "Need automatic gradients" → **07_autograd** **07_autograd** → "Need systematic training" → **08_training** **08_training** → "MLPs hit limits on images" → **09_spatial** **09_spatial** → "Training is too slow" → **10_dataloader** ## Technical Changes ### Module Directory Renaming - `06_autograd` → `07_autograd` - `07_dataloader` → `10_dataloader` - `08_optimizers` → `06_optimizers` - `10_training` → `08_training` - `09_spatial` → `09_spatial` (no change) ### System Integration Updates - **MODULE_TO_CHECKPOINT mapping**: Updated in tito/commands/export.py - **Test directories**: Renamed module_XX directories to match new numbers - **Documentation**: Updated all references in MD files and agent configurations - **CLI integration**: Updated next-steps suggestions for proper flow ### Agent Configuration Updates - **Quality Assurance**: Updated module audit status with new numbers - **Module Developer**: Updated work tracking with new sequence - **Documentation**: Updated MASTER_PLAN_OF_RECORD.md with beautiful progression ## Educational Benefits 1. **Inevitable Discovery**: Each module naturally leads to the next 2. **Cognitive Load**: Concepts introduced exactly when needed 3. **Motivation**: Students understand WHY each tool is necessary 4. 
**Synthesis**: Everything flows toward complete ML systems understanding 5. **Professional Alignment**: Matches real ML engineering workflows ## Quality Assurance - ✅ All CLI commands still function - ✅ Checkpoint system mappings updated - ✅ Documentation consistency maintained - ✅ Test directory structure aligned - ✅ Agent configurations synchronized **Impact**: This reordering transforms TinyTorch from a collection of modules into a coherent educational journey where each step naturally motivates the next, creating optimal conditions for deep learning systems understanding.
390 lines
18 KiB
Python
390 lines
18 KiB
Python
"""
|
|
Integration Tests - DataLoader and Tensor
|
|
|
|
Tests real integration between DataLoader and Tensor modules.
|
|
Uses actual TinyTorch components to verify they work together correctly.
|
|
"""
|
|
|
|
import pytest
|
|
import numpy as np
|
|
from test_utils import setup_integration_test
|
|
|
|
# Ensure proper setup before importing
|
|
setup_integration_test()
|
|
|
|
# Import ONLY from TinyTorch package
|
|
from tinytorch.core.tensor import Tensor
|
|
from tinytorch.core.dataloader import DataLoader, Dataset, SimpleDataset
|
|
from tinytorch.core.activations import ReLU
|
|
from tinytorch.core.layers import Dense
|
|
|
|
|
|
class TestDataLoaderTensorIntegration:
    """Test integration between DataLoader and Tensor components."""

    def test_simple_dataset_produces_tensors(self):
        """Test SimpleDataset produces real Tensor objects."""
        dataset = SimpleDataset(size=10, num_features=3, num_classes=2)

        # Retrieve one (data, label) pair by index.
        x, y = dataset[0]

        # Both elements must be genuine Tensor objects, not raw arrays.
        assert isinstance(x, Tensor), "Data should be a Tensor"
        assert isinstance(y, Tensor), "Label should be a Tensor"

        # Shape and dtype must match the dataset configuration.
        assert x.shape == (3,), f"Expected data shape (3,), got {x.shape}"
        assert y.shape == (), f"Expected label shape (), got {y.shape}"
        assert x.dtype == np.float32, f"Expected float32, got {x.dtype}"
        assert y.dtype == np.int32, f"Expected int32, got {y.dtype}"

    def test_dataloader_produces_tensor_batches(self):
        """Test DataLoader produces batches of real Tensor objects."""
        loader = DataLoader(
            SimpleDataset(size=20, num_features=4, num_classes=3),
            batch_size=5,
            shuffle=False,
        )

        # Take only the first batch from the iterator.
        xb, yb = next(iter(loader))

        # Batches must come back wrapped as Tensors.
        assert isinstance(xb, Tensor), "Batch data should be a Tensor"
        assert isinstance(yb, Tensor), "Batch labels should be a Tensor"

        # Batch dimensions: (batch_size, num_features) and (batch_size,).
        assert xb.shape == (5, 4), f"Expected batch data shape (5, 4), got {xb.shape}"
        assert yb.shape == (5,), f"Expected batch labels shape (5,), got {yb.shape}"

        # Features are float32; class labels are int32.
        assert xb.dtype == np.float32, f"Expected float32, got {xb.dtype}"
        assert yb.dtype == np.int32, f"Expected int32, got {yb.dtype}"

    def test_dataloader_tensor_compatibility_with_activations(self):
        """Test DataLoader tensors work with activation functions."""
        loader = DataLoader(
            SimpleDataset(size=10, num_features=3, num_classes=2),
            batch_size=4,
            shuffle=False,
        )
        xb, yb = next(iter(loader))

        # Run the batch straight through a ReLU activation.
        out = ReLU()(xb)

        assert isinstance(out, Tensor), "Activated data should be a Tensor"
        assert out.shape == xb.shape, "Shape should be preserved"

        # ReLU clamps negatives to zero, so nothing may be negative.
        assert np.all(out.data >= 0), "ReLU should produce non-negative values"

    def test_dataloader_tensor_compatibility_with_layers(self):
        """Test DataLoader tensors work with neural network layers."""
        loader = DataLoader(
            SimpleDataset(size=10, num_features=3, num_classes=2),
            batch_size=4,
            shuffle=False,
        )
        xb, yb = next(iter(loader))

        # Push the batch through a single Dense layer.
        layer = Dense(input_size=3, output_size=2)
        out = layer(xb)

        assert isinstance(out, Tensor), "Layer output should be a Tensor"
        assert out.shape == (4, 2), f"Expected output shape (4, 2), got {out.shape}"
        assert out.dtype == np.float32, f"Expected float32, got {out.dtype}"

    def test_dataloader_full_pipeline_integration(self):
        """Test DataLoader tensors in complete ML pipeline."""
        loader = DataLoader(
            SimpleDataset(size=12, num_features=4, num_classes=3),
            batch_size=6,
            shuffle=False,
        )
        xb, yb = next(iter(loader))

        # Small MLP: Dense(4->8) -> ReLU -> Dense(8->3).
        first = Dense(input_size=4, output_size=8)
        act = ReLU()
        second = Dense(input_size=8, output_size=3)

        hidden = first(xb)
        activated = act(hidden)
        output = second(activated)

        # Every intermediate result stays a Tensor.
        assert isinstance(hidden, Tensor), "Hidden layer should be Tensor"
        assert isinstance(activated, Tensor), "Activated layer should be Tensor"
        assert isinstance(output, Tensor), "Output layer should be Tensor"

        # Shapes must track the layer sizes through the pipeline.
        assert hidden.shape == (6, 8), f"Hidden shape should be (6, 8), got {hidden.shape}"
        assert activated.shape == (6, 8), f"Activated shape should be (6, 8), got {activated.shape}"
        assert output.shape == (6, 3), f"Output shape should be (6, 3), got {output.shape}"
class TestDataLoaderTensorBatching:
    """Test DataLoader batching with tensor integration."""

    def test_different_batch_sizes(self):
        """Test DataLoader with different batch sizes produces correct tensors."""
        dataset = SimpleDataset(size=20, num_features=3, num_classes=2)

        for bs in (1, 4, 8, 10):
            loader = DataLoader(dataset, batch_size=bs, shuffle=False)
            xb, yb = next(iter(loader))

            # The first batch must carry exactly bs rows.
            assert xb.shape == (bs, 3), f"Data shape should be ({bs}, 3), got {xb.shape}"
            assert yb.shape == (bs,), f"Label shape should be ({bs},), got {yb.shape}"

            # Both halves of the batch remain Tensors.
            assert isinstance(xb, Tensor), "Batch data should be Tensor"
            assert isinstance(yb, Tensor), "Batch labels should be Tensor"

    def test_shuffling_preserves_tensor_integrity(self):
        """Test that shuffling preserves tensor data integrity."""
        dataset = SimpleDataset(size=10, num_features=2, num_classes=2)

        # One ordered loader and one shuffled loader over the same dataset.
        ordered_batch = next(iter(DataLoader(dataset, batch_size=5, shuffle=False)))
        shuffled_batch = next(iter(DataLoader(dataset, batch_size=5, shuffle=True)))

        # Shuffled or not, batches must be well-formed tensors.
        for xb, yb in (ordered_batch, shuffled_batch):
            assert isinstance(xb, Tensor), "Data should be Tensor"
            assert isinstance(yb, Tensor), "Labels should be Tensor"
            assert xb.shape == (5, 2), f"Expected shape (5, 2), got {xb.shape}"
            assert yb.shape == (5,), f"Expected shape (5,), got {yb.shape}"

    def test_iteration_produces_consistent_tensors(self):
        """Test that iterating through DataLoader produces consistent tensors."""
        loader = DataLoader(
            SimpleDataset(size=12, num_features=3, num_classes=2),
            batch_size=4,
            shuffle=False,
        )

        seen = 0
        for xb, yb in loader:
            seen += 1

            # Each batch in the full iteration must be a valid Tensor pair.
            assert isinstance(xb, Tensor), f"Batch {seen} data should be Tensor"
            assert isinstance(yb, Tensor), f"Batch {seen} labels should be Tensor"

            # The last batch may be short, so only the feature dim is fixed.
            assert xb.shape[1] == 3, f"Feature dim should be 3, got {xb.shape[1]}"
            assert xb.shape[0] == yb.shape[0], "Batch and label sizes should match"

            assert xb.dtype == np.float32, "Data should be float32"
            assert yb.dtype == np.int32, "Labels should be int32"

        # 12 samples at batch_size 4 -> exactly 3 batches.
        assert seen == 3, f"Expected 3 batches, got {seen}"
class TestDataLoaderTensorDataTypes:
    """Test DataLoader tensor data type handling."""

    def test_float32_tensor_production(self):
        """Test DataLoader produces float32 tensors for data."""
        loader = DataLoader(
            SimpleDataset(size=8, num_features=2, num_classes=2),
            batch_size=4,
            shuffle=False,
        )
        xb, yb = next(iter(loader))

        # Feature tensors must be float32 all the way down to the numpy backing array.
        assert xb.dtype == np.float32, f"Expected float32, got {xb.dtype}"
        assert isinstance(xb.data, np.ndarray), "Underlying data should be numpy array"
        assert xb.data.dtype == np.float32, "Underlying array should be float32"

    def test_int32_tensor_production(self):
        """Test DataLoader produces int32 tensors for labels."""
        loader = DataLoader(
            SimpleDataset(size=8, num_features=2, num_classes=3),
            batch_size=4,
            shuffle=False,
        )
        xb, yb = next(iter(loader))

        # Label tensors must be int32 all the way down to the numpy backing array.
        assert yb.dtype == np.int32, f"Expected int32, got {yb.dtype}"
        assert isinstance(yb.data, np.ndarray), "Underlying labels should be numpy array"
        assert yb.data.dtype == np.int32, "Underlying array should be int32"

    def test_tensor_data_ranges(self):
        """Test DataLoader produces tensors with reasonable data ranges."""
        loader = DataLoader(
            SimpleDataset(size=10, num_features=3, num_classes=2),
            batch_size=5,
            shuffle=False,
        )
        xb, yb = next(iter(loader))

        # No NaN/inf in features; labels confined to [0, num_classes).
        assert np.all(np.isfinite(xb.data)), "Data should be finite"
        assert np.all(yb.data >= 0), "Labels should be non-negative"
        assert np.all(yb.data < 2), "Labels should be less than num_classes"
class TestDataLoaderTensorRealisticScenarios:
    """Test DataLoader with realistic tensor scenarios."""

    def test_training_loop_simulation(self):
        """Test DataLoader tensors in training loop simulation.

        Uses one Dense model for the whole run: a real training loop
        trains a single model across batches and epochs, and creating
        the layer inside the batch loop (as before) both redid the
        loop-invariant initialization and contradicted the simulation.
        """
        dataset = SimpleDataset(size=16, num_features=4, num_classes=2)
        dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

        # Single model shared across all epochs/batches (hoisted out of the loop).
        dense = Dense(input_size=4, output_size=2)

        # Simulate training loop
        epoch_batches = 0
        for epoch in range(2):
            batch_count = 0
            for batch_data, batch_labels in dataloader:
                batch_count += 1

                # Simulate forward pass
                output = dense(batch_data)

                # Verify tensor operations work
                assert isinstance(output, Tensor), "Forward pass should produce Tensor"
                assert output.shape == (8, 2), f"Expected shape (8, 2), got {output.shape}"

                # Simulate loss computation (simplified)
                loss = output.data.mean()  # Simple loss
                assert np.isfinite(loss), "Loss should be finite"

                epoch_batches += 1

            # 16 samples / batch_size 8 -> 2 batches per epoch.
            assert batch_count == 2, f"Expected 2 batches per epoch, got {batch_count}"

        assert epoch_batches == 4, f"Expected 4 total batches, got {epoch_batches}"

    def test_different_dataset_sizes(self):
        """Test DataLoader with different dataset sizes."""
        test_cases = [
            (5, 2),     # Small dataset
            (32, 8),    # Medium dataset
            (100, 16),  # Large dataset
        ]

        for dataset_size, batch_size in test_cases:
            dataset = SimpleDataset(size=dataset_size, num_features=3, num_classes=2)
            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

            total_samples = 0
            for batch_data, batch_labels in dataloader:
                # Verify tensor properties
                assert isinstance(batch_data, Tensor), "Data should be Tensor"
                assert isinstance(batch_labels, Tensor), "Labels should be Tensor"

                # Count samples so full coverage can be confirmed below.
                total_samples += batch_data.shape[0]

                # Verify shapes (last batch may be short, so only feature dim is fixed)
                assert batch_data.shape[1] == 3, "Feature dim should be 3"
                assert batch_data.shape[0] == batch_labels.shape[0], "Batch sizes should match"

            # Every sample must appear exactly once across all batches.
            assert total_samples == dataset_size, f"Should process all {dataset_size} samples, got {total_samples}"

    def test_dataloader_with_complex_pipeline(self):
        """Test DataLoader integration with complex neural network pipeline."""
        dataset = SimpleDataset(size=20, num_features=5, num_classes=3)
        dataloader = DataLoader(dataset, batch_size=10, shuffle=False)

        # Three-layer pipeline: Dense(5->16) -> ReLU -> Dense(16->8) -> ReLU -> Dense(8->3).
        dense1 = Dense(input_size=5, output_size=16)
        relu1 = ReLU()
        dense2 = Dense(input_size=16, output_size=8)
        relu2 = ReLU()
        dense3 = Dense(input_size=8, output_size=3)

        # Process every batch through the full pipeline.
        for batch_data, batch_labels in dataloader:
            x = dense1(batch_data)
            x = relu1(x)
            x = dense2(x)
            x = relu2(x)
            output = dense3(x)

            # Verify final output
            assert isinstance(output, Tensor), "Final output should be Tensor"
            assert output.shape == (10, 3), f"Expected shape (10, 3), got {output.shape}"
            assert output.dtype == np.float32, "Output should be float32"
            assert np.all(np.isfinite(output.data)), "Output should be finite"

    def test_dataloader_memory_efficiency(self):
        """Test DataLoader memory efficiency with tensor operations."""
        dataset = SimpleDataset(size=50, num_features=10, num_classes=5)
        dataloader = DataLoader(dataset, batch_size=25, shuffle=False)

        # Store tensor metadata only (not the tensors) to avoid holding batches alive.
        processed_batches = []
        for batch_data, batch_labels in dataloader:
            batch_info = {
                'data_shape': batch_data.shape,
                'label_shape': batch_labels.shape,
                'data_type': batch_data.dtype,
                'label_type': batch_labels.dtype
            }
            processed_batches.append(batch_info)

            # Verify tensors are properly formed
            assert isinstance(batch_data, Tensor), "Data should be Tensor"
            assert isinstance(batch_labels, Tensor), "Labels should be Tensor"

        # 50 samples / batch_size 25 -> exactly 2 batches.
        assert len(processed_batches) == 2, f"Expected 2 batches, got {len(processed_batches)}"

        # Verify consistency across batches
        for i, batch_info in enumerate(processed_batches):
            assert batch_info['data_shape'][1] == 10, f"Batch {i} should have 10 features"
            assert batch_info['data_type'] == np.float32, f"Batch {i} data should be float32"
            assert batch_info['label_type'] == np.int32, f"Batch {i} labels should be int32"
class TestCustomDatasetIntegration:
    """Test custom dataset integration with tensor operations."""

    def test_custom_dataset_with_tensors(self):
        """Test custom dataset that produces tensors works with DataLoader."""

        class CustomTensorDataset(Dataset):
            # Minimal Dataset that pre-builds Tensor samples and Tensor labels.
            def __init__(self, size: int):
                self.size = size
                self.data = [Tensor(np.random.rand(3).astype(np.float32)) for _ in range(size)]
                self.labels = [Tensor(np.random.randint(0, 2, dtype=np.int32)) for _ in range(size)]

            def __len__(self):
                return self.size

            def __getitem__(self, index):
                return self.data[index], self.labels[index]

        # Wire the custom dataset into a DataLoader and pull one batch.
        loader = DataLoader(CustomTensorDataset(size=12), batch_size=4, shuffle=False)
        xb, yb = next(iter(loader))

        # Batches assembled from custom Tensor samples stay well-formed.
        assert isinstance(xb, Tensor), "Batch data should be Tensor"
        assert isinstance(yb, Tensor), "Batch labels should be Tensor"
        assert xb.shape == (4, 3), f"Expected shape (4, 3), got {xb.shape}"
        assert yb.shape == (4,), f"Expected shape (4,), got {yb.shape}"

        # The batch also plugs directly into a Dense layer.
        out = Dense(input_size=3, output_size=2)(xb)

        assert isinstance(out, Tensor), "Dense output should be Tensor"
        assert out.shape == (4, 2), f"Expected shape (4, 2), got {out.shape}"
assert output.shape == (4, 2), f"Expected shape (4, 2), got {output.shape}" |