TinyTorch/tests/module_06/test_dataloader_tensor_integration.py
Vijay Janapa Reddi 2f23f757e7 MAJOR: Implement beautiful module progression through strategic reordering
This commit implements the pedagogically optimal "inevitable discovery" module progression based on expert validation and educational design principles.

## Module Reordering Summary

**Previous Order (Problems)**:
- 05_losses → 06_autograd → 07_dataloader → 08_optimizers → 09_spatial → 10_training
- Issues: Autograd before optimizers, DataLoader before training, scattered dependencies

**New Order (Beautiful Progression)**:
- 05_losses → 06_optimizers → 07_autograd → 08_training → 09_spatial → 10_dataloader
- Benefits: Each module creates inevitable need for the next

## Pedagogical Flow Achieved

- **05_losses** → "Need systematic weight updates" → **06_optimizers**
- **06_optimizers** → "Need automatic gradients" → **07_autograd**
- **07_autograd** → "Need systematic training" → **08_training**
- **08_training** → "MLPs hit limits on images" → **09_spatial**
- **09_spatial** → "Training is too slow" → **10_dataloader**

## Technical Changes

### Module Directory Renaming
- `06_autograd` → `07_autograd`
- `07_dataloader` → `10_dataloader`
- `08_optimizers` → `06_optimizers`
- `10_training` → `08_training`
- `09_spatial` → `09_spatial` (no change; see the rename sketch below)
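
A minimal sketch of how these renames could be applied, assuming a pathlib-based helper (hypothetical; not the actual migration script). Staging through temporary names matters because the old and new numbers overlap (e.g. `06_autograd` → `07_autograd` while `07_dataloader` still exists):

```python
from pathlib import Path

# Hypothetical rename map mirroring the list above.
RENAMES = {
    "06_autograd": "07_autograd",
    "07_dataloader": "10_dataloader",
    "08_optimizers": "06_optimizers",
    "10_training": "08_training",
}

def apply_renames(root: Path) -> None:
    # Phase 1: move everything to temporary names so no rename
    # collides with a directory that has not moved yet.
    for old in RENAMES:
        (root / old).rename(root / f"_tmp_{old}")
    # Phase 2: move from the temporary names to the final numbering.
    for old, new in RENAMES.items():
        (root / f"_tmp_{old}").rename(root / new)
```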

### System Integration Updates
- **MODULE_TO_CHECKPOINT mapping**: Updated in tito/commands/export.py (sketched after this list)
- **Test directories**: Renamed module_XX directories to match new numbers
- **Documentation**: Updated all references in MD files and agent configurations
- **CLI integration**: Updated next-steps suggestions for proper flow
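
For illustration, a hypothetical sketch of the shape the updated mapping presumably takes after the reordering (the authoritative version lives in tito/commands/export.py and may differ in detail):

```python
# Hypothetical sketch only; see tito/commands/export.py for the real mapping.
MODULE_TO_CHECKPOINT = {
    "05_losses": 5,
    "06_optimizers": 6,
    "07_autograd": 7,
    "08_training": 8,
    "09_spatial": 9,
    "10_dataloader": 10,
}
```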

### Agent Configuration Updates
- **Quality Assurance**: Updated module audit status with new numbers
- **Module Developer**: Updated work tracking with new sequence
- **Documentation**: Updated MASTER_PLAN_OF_RECORD.md with beautiful progression

## Educational Benefits

1. **Inevitable Discovery**: Each module naturally leads to the next
2. **Cognitive Load**: Concepts introduced exactly when needed
3. **Motivation**: Students understand WHY each tool is necessary
4. **Synthesis**: Everything flows toward complete ML systems understanding
5. **Professional Alignment**: Matches real ML engineering workflows

## Quality Assurance

- All CLI commands still function
- Checkpoint system mappings updated
- Documentation consistency maintained
- Test directory structure aligned
- Agent configurations synchronized

**Impact**: This reordering transforms TinyTorch from a collection of modules into a coherent educational journey where each step naturally motivates the next, creating optimal conditions for deep learning systems understanding.
2025-09-24 15:56:47 -04:00

390 lines
18 KiB
Python

"""
Integration Tests - DataLoader and Tensor
Tests real integration between DataLoader and Tensor modules.
Uses actual TinyTorch components to verify they work together correctly.
"""
import pytest
import numpy as np
from test_utils import setup_integration_test
# Ensure proper setup before importing
setup_integration_test()
# Import ONLY from TinyTorch package
from tinytorch.core.tensor import Tensor
from tinytorch.core.dataloader import DataLoader, Dataset, SimpleDataset
from tinytorch.core.activations import ReLU
from tinytorch.core.layers import Dense


class TestDataLoaderTensorIntegration:
"""Test integration between DataLoader and Tensor components."""
def test_simple_dataset_produces_tensors(self):
"""Test SimpleDataset produces real Tensor objects."""
# Create SimpleDataset
dataset = SimpleDataset(size=10, num_features=3, num_classes=2)
# Get a sample
data, label = dataset[0]
# Verify outputs are tensors
assert isinstance(data, Tensor), "Data should be a Tensor"
assert isinstance(label, Tensor), "Label should be a Tensor"
# Verify tensor properties
assert data.shape == (3,), f"Expected data shape (3,), got {data.shape}"
assert label.shape == (), f"Expected label shape (), got {label.shape}"
assert data.dtype == np.float32, f"Expected float32, got {data.dtype}"
assert label.dtype == np.int32, f"Expected int32, got {label.dtype}"
def test_dataloader_produces_tensor_batches(self):
"""Test DataLoader produces batches of real Tensor objects."""
# Create dataset and dataloader
dataset = SimpleDataset(size=20, num_features=4, num_classes=3)
dataloader = DataLoader(dataset, batch_size=5, shuffle=False)
# Get first batch
batch_data, batch_labels = next(iter(dataloader))
# Verify batch outputs are tensors
assert isinstance(batch_data, Tensor), "Batch data should be a Tensor"
assert isinstance(batch_labels, Tensor), "Batch labels should be a Tensor"
# Verify batch shapes
assert batch_data.shape == (5, 4), f"Expected batch data shape (5, 4), got {batch_data.shape}"
assert batch_labels.shape == (5,), f"Expected batch labels shape (5,), got {batch_labels.shape}"
# Verify data types
assert batch_data.dtype == np.float32, f"Expected float32, got {batch_data.dtype}"
assert batch_labels.dtype == np.int32, f"Expected int32, got {batch_labels.dtype}"
def test_dataloader_tensor_compatibility_with_activations(self):
"""Test DataLoader tensors work with activation functions."""
# Create dataset and dataloader
dataset = SimpleDataset(size=10, num_features=3, num_classes=2)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
# Get batch
batch_data, batch_labels = next(iter(dataloader))
# Apply activation function
relu = ReLU()
activated_data = relu(batch_data)
# Verify result is tensor
assert isinstance(activated_data, Tensor), "Activated data should be a Tensor"
assert activated_data.shape == batch_data.shape, "Shape should be preserved"
# Verify ReLU applied correctly (non-negative values)
assert np.all(activated_data.data >= 0), "ReLU should produce non-negative values"
def test_dataloader_tensor_compatibility_with_layers(self):
"""Test DataLoader tensors work with neural network layers."""
# Create dataset and dataloader
dataset = SimpleDataset(size=10, num_features=3, num_classes=2)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
# Get batch
batch_data, batch_labels = next(iter(dataloader))
# Apply dense layer
dense = Dense(input_size=3, output_size=2)
output = dense(batch_data)
# Verify result is tensor
assert isinstance(output, Tensor), "Layer output should be a Tensor"
assert output.shape == (4, 2), f"Expected output shape (4, 2), got {output.shape}"
assert output.dtype == np.float32, f"Expected float32, got {output.dtype}"
def test_dataloader_full_pipeline_integration(self):
"""Test DataLoader tensors in complete ML pipeline."""
# Create dataset and dataloader
dataset = SimpleDataset(size=12, num_features=4, num_classes=3)
dataloader = DataLoader(dataset, batch_size=6, shuffle=False)
# Get batch
batch_data, batch_labels = next(iter(dataloader))
# Apply full pipeline: Dense → ReLU → Dense
dense1 = Dense(input_size=4, output_size=8)
relu = ReLU()
dense2 = Dense(input_size=8, output_size=3)
# Forward pass
hidden = dense1(batch_data)
activated = relu(hidden)
output = dense2(activated)
# Verify all outputs are tensors
assert isinstance(hidden, Tensor), "Hidden layer should be Tensor"
assert isinstance(activated, Tensor), "Activated layer should be Tensor"
assert isinstance(output, Tensor), "Output layer should be Tensor"
# Verify shapes through pipeline
assert hidden.shape == (6, 8), f"Hidden shape should be (6, 8), got {hidden.shape}"
assert activated.shape == (6, 8), f"Activated shape should be (6, 8), got {activated.shape}"
assert output.shape == (6, 3), f"Output shape should be (6, 3), got {output.shape}"


class TestDataLoaderTensorBatching:
"""Test DataLoader batching with tensor integration."""
def test_different_batch_sizes(self):
"""Test DataLoader with different batch sizes produces correct tensors."""
dataset = SimpleDataset(size=20, num_features=3, num_classes=2)
batch_sizes = [1, 4, 8, 10]
for batch_size in batch_sizes:
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
batch_data, batch_labels = next(iter(dataloader))
# Verify tensor shapes
assert batch_data.shape == (batch_size, 3), f"Data shape should be ({batch_size}, 3), got {batch_data.shape}"
assert batch_labels.shape == (batch_size,), f"Label shape should be ({batch_size},), got {batch_labels.shape}"
# Verify tensor types
assert isinstance(batch_data, Tensor), "Batch data should be Tensor"
assert isinstance(batch_labels, Tensor), "Batch labels should be Tensor"
def test_shuffling_preserves_tensor_integrity(self):
"""Test that shuffling preserves tensor data integrity."""
dataset = SimpleDataset(size=10, num_features=2, num_classes=2)
# Create two dataloaders with different shuffle settings
dataloader_no_shuffle = DataLoader(dataset, batch_size=5, shuffle=False)
dataloader_shuffle = DataLoader(dataset, batch_size=5, shuffle=True)
# Get batches
batch_no_shuffle = next(iter(dataloader_no_shuffle))
batch_shuffle = next(iter(dataloader_shuffle))
# Both should produce valid tensors
for batch_data, batch_labels in [batch_no_shuffle, batch_shuffle]:
assert isinstance(batch_data, Tensor), "Data should be Tensor"
assert isinstance(batch_labels, Tensor), "Labels should be Tensor"
assert batch_data.shape == (5, 2), f"Expected shape (5, 2), got {batch_data.shape}"
assert batch_labels.shape == (5,), f"Expected shape (5,), got {batch_labels.shape}"
def test_iteration_produces_consistent_tensors(self):
"""Test that iterating through DataLoader produces consistent tensors."""
dataset = SimpleDataset(size=12, num_features=3, num_classes=2)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
batch_count = 0
for batch_data, batch_labels in dataloader:
batch_count += 1
# Verify each batch produces valid tensors
assert isinstance(batch_data, Tensor), f"Batch {batch_count} data should be Tensor"
assert isinstance(batch_labels, Tensor), f"Batch {batch_count} labels should be Tensor"
# Verify shapes (last batch might be smaller)
assert batch_data.shape[1] == 3, f"Feature dim should be 3, got {batch_data.shape[1]}"
assert batch_data.shape[0] == batch_labels.shape[0], "Batch and label sizes should match"
# Verify data types
assert batch_data.dtype == np.float32, "Data should be float32"
assert batch_labels.dtype == np.int32, "Labels should be int32"
# Should have processed all data
assert batch_count == 3, f"Expected 3 batches, got {batch_count}"


class TestDataLoaderTensorDataTypes:
"""Test DataLoader tensor data type handling."""
def test_float32_tensor_production(self):
"""Test DataLoader produces float32 tensors for data."""
dataset = SimpleDataset(size=8, num_features=2, num_classes=2)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
batch_data, batch_labels = next(iter(dataloader))
# Verify data types
assert batch_data.dtype == np.float32, f"Expected float32, got {batch_data.dtype}"
assert isinstance(batch_data.data, np.ndarray), "Underlying data should be numpy array"
assert batch_data.data.dtype == np.float32, "Underlying array should be float32"
def test_int32_tensor_production(self):
"""Test DataLoader produces int32 tensors for labels."""
dataset = SimpleDataset(size=8, num_features=2, num_classes=3)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
batch_data, batch_labels = next(iter(dataloader))
# Verify label types
assert batch_labels.dtype == np.int32, f"Expected int32, got {batch_labels.dtype}"
assert isinstance(batch_labels.data, np.ndarray), "Underlying labels should be numpy array"
assert batch_labels.data.dtype == np.int32, "Underlying array should be int32"
def test_tensor_data_ranges(self):
"""Test DataLoader produces tensors with reasonable data ranges."""
dataset = SimpleDataset(size=10, num_features=3, num_classes=2)
dataloader = DataLoader(dataset, batch_size=5, shuffle=False)
batch_data, batch_labels = next(iter(dataloader))
# Check data ranges
assert np.all(np.isfinite(batch_data.data)), "Data should be finite"
assert np.all(batch_labels.data >= 0), "Labels should be non-negative"
assert np.all(batch_labels.data < 2), "Labels should be less than num_classes"


class TestDataLoaderTensorRealisticScenarios:
"""Test DataLoader with realistic tensor scenarios."""
def test_training_loop_simulation(self):
"""Test DataLoader tensors in training loop simulation."""
dataset = SimpleDataset(size=16, num_features=4, num_classes=2)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
# Simulate training loop
epoch_batches = 0
for epoch in range(2):
batch_count = 0
for batch_data, batch_labels in dataloader:
batch_count += 1
# Simulate forward pass
dense = Dense(input_size=4, output_size=2)
output = dense(batch_data)
# Verify tensor operations work
assert isinstance(output, Tensor), "Forward pass should produce Tensor"
assert output.shape == (8, 2), f"Expected shape (8, 2), got {output.shape}"
# Simulate loss computation (simplified)
loss = output.data.mean() # Simple loss
assert np.isfinite(loss), "Loss should be finite"
epoch_batches += 1
assert batch_count == 2, f"Expected 2 batches per epoch, got {batch_count}"
assert epoch_batches == 4, f"Expected 4 total batches, got {epoch_batches}"
def test_different_dataset_sizes(self):
"""Test DataLoader with different dataset sizes."""
test_cases = [
(5, 2), # Small dataset
(32, 8), # Medium dataset
(100, 16), # Large dataset
]
for dataset_size, batch_size in test_cases:
dataset = SimpleDataset(size=dataset_size, num_features=3, num_classes=2)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
total_samples = 0
for batch_data, batch_labels in dataloader:
# Verify tensor properties
assert isinstance(batch_data, Tensor), "Data should be Tensor"
assert isinstance(batch_labels, Tensor), "Labels should be Tensor"
# Count samples
total_samples += batch_data.shape[0]
# Verify shapes
assert batch_data.shape[1] == 3, "Feature dim should be 3"
assert batch_data.shape[0] == batch_labels.shape[0], "Batch sizes should match"
assert total_samples == dataset_size, f"Should process all {dataset_size} samples, got {total_samples}"
def test_dataloader_with_complex_pipeline(self):
"""Test DataLoader integration with complex neural network pipeline."""
dataset = SimpleDataset(size=20, num_features=5, num_classes=3)
dataloader = DataLoader(dataset, batch_size=10, shuffle=False)
# Create complex pipeline
dense1 = Dense(input_size=5, output_size=16)
relu1 = ReLU()
dense2 = Dense(input_size=16, output_size=8)
relu2 = ReLU()
dense3 = Dense(input_size=8, output_size=3)
# Process batches
for batch_data, batch_labels in dataloader:
# Forward pass through complex pipeline
x = dense1(batch_data)
x = relu1(x)
x = dense2(x)
x = relu2(x)
output = dense3(x)
# Verify final output
assert isinstance(output, Tensor), "Final output should be Tensor"
assert output.shape == (10, 3), f"Expected shape (10, 3), got {output.shape}"
assert output.dtype == np.float32, "Output should be float32"
assert np.all(np.isfinite(output.data)), "Output should be finite"
def test_dataloader_memory_efficiency(self):
"""Test DataLoader memory efficiency with tensor operations."""
dataset = SimpleDataset(size=50, num_features=10, num_classes=5)
dataloader = DataLoader(dataset, batch_size=25, shuffle=False)
# Process batches and verify memory usage patterns
processed_batches = []
for batch_data, batch_labels in dataloader:
# Store tensor info (not the actual tensors to avoid memory issues)
batch_info = {
'data_shape': batch_data.shape,
'label_shape': batch_labels.shape,
'data_type': batch_data.dtype,
'label_type': batch_labels.dtype
}
processed_batches.append(batch_info)
# Verify tensors are properly formed
assert isinstance(batch_data, Tensor), "Data should be Tensor"
assert isinstance(batch_labels, Tensor), "Labels should be Tensor"
# Verify we processed expected number of batches
assert len(processed_batches) == 2, f"Expected 2 batches, got {len(processed_batches)}"
# Verify consistency across batches
for i, batch_info in enumerate(processed_batches):
assert batch_info['data_shape'][1] == 10, f"Batch {i} should have 10 features"
assert batch_info['data_type'] == np.float32, f"Batch {i} data should be float32"
assert batch_info['label_type'] == np.int32, f"Batch {i} labels should be int32"


class TestCustomDatasetIntegration:
"""Test custom dataset integration with tensor operations."""
def test_custom_dataset_with_tensors(self):
"""Test custom dataset that produces tensors works with DataLoader."""
class CustomTensorDataset(Dataset):
def __init__(self, size: int):
self.size = size
self.data = [Tensor(np.random.rand(3).astype(np.float32)) for _ in range(size)]
self.labels = [Tensor(np.random.randint(0, 2, dtype=np.int32)) for _ in range(size)]
def __len__(self):
return self.size
def __getitem__(self, index):
return self.data[index], self.labels[index]
# Create custom dataset and dataloader
dataset = CustomTensorDataset(size=12)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
# Test integration
batch_data, batch_labels = next(iter(dataloader))
# Verify tensor properties
assert isinstance(batch_data, Tensor), "Batch data should be Tensor"
assert isinstance(batch_labels, Tensor), "Batch labels should be Tensor"
assert batch_data.shape == (4, 3), f"Expected shape (4, 3), got {batch_data.shape}"
assert batch_labels.shape == (4,), f"Expected shape (4,), got {batch_labels.shape}"
# Test with neural network components
dense = Dense(input_size=3, output_size=2)
output = dense(batch_data)
assert isinstance(output, Tensor), "Dense output should be Tensor"
assert output.shape == (4, 2), f"Expected shape (4, 2), got {output.shape}"