Finalize Module 08 and add integration tests

Added integration tests for DataLoader:
- test_dataloader_integration.py in tests/integration/
  - Training workflow integration
  - Shuffle consistency across epochs
  - Memory efficiency verification

Updated Module 08:
- Added note about optional performance analysis
- Clarified that analysis functions can be run manually
- Clean flow: text → code → tests

Updated datasets/tiny/README.md:
- Minor formatting fixes

Module 08 is now complete and ready to export:
- Dataset abstraction
- TensorDataset implementation
- DataLoader with batching/shuffling
- ASCII visualizations for understanding
- Unit tests (in module)
- Integration tests (in tests/)
- Performance analysis tools (optional)

Next: Export with 'bin/tito export 08_dataloader'
This commit is contained in:
Vijay Janapa Reddi
2025-09-30 16:07:55 -04:00
parent 779c47ed7a
commit 97fece7b5f
3 changed files with 154 additions and 3 deletions

View File

@@ -0,0 +1,149 @@
"""
Integration tests for DataLoader with training workflows.
These tests verify that DataLoader works correctly when integrated with
actual training pipelines, not just in isolation.
"""
import numpy as np
import sys
import os
# Add project root to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
from tinytorch import Tensor
from tinytorch.data.loader import Dataset, TensorDataset, DataLoader
def test_training_workflow_integration():
    """
    Test DataLoader integration with a realistic training workflow.

    Simulates:
    - Train/val split (80/20) built by indexing a full TensorDataset
    - DataLoader creation (shuffle on for training, off for validation)
    - Batch iteration with per-batch shape checks
    - Complete epoch processing for both the training and validation loaders

    Raises:
        AssertionError: if a split has the wrong number of samples, a batch
            has an unexpected shape, or a loader yields a different number
            of samples than its dataset holds.
    """
    print("🔬 Integration Test: DataLoader + Training Workflow...")

    # Create synthetic dataset (simulate real data)
    num_samples = 1000
    num_features = 20
    num_classes = 5
    features = np.random.randn(num_samples, num_features).astype(np.float32)
    labels = np.random.randint(0, num_classes, num_samples).astype(np.int64)
    dataset_full = TensorDataset(Tensor(features), Tensor(labels))

    # Split into train/val (80/20 split). Samples are pulled one index at a
    # time on purpose so that dataset indexing is exercised as well.
    train_size = int(0.8 * num_samples)
    val_size = num_samples - train_size
    train_samples = [dataset_full[i] for i in range(train_size)]
    val_samples = [dataset_full[i] for i in range(train_size, num_samples)]

    # Create tensors from samples
    train_features = Tensor(np.stack([sample[0].data for sample in train_samples]))
    train_labels = Tensor(np.stack([sample[1].data for sample in train_samples]))
    val_features = Tensor(np.stack([sample[0].data for sample in val_samples]))
    val_labels = Tensor(np.stack([sample[1].data for sample in val_samples]))
    train_dataset = TensorDataset(train_features, train_labels)
    val_dataset = TensorDataset(val_features, val_labels)

    # The split itself must account for every sample exactly once.
    assert len(train_dataset) == train_size, f"Expected {train_size} training samples, got {len(train_dataset)}"
    assert len(val_dataset) == val_size, f"Expected {val_size} validation samples, got {len(val_dataset)}"

    # Create DataLoaders
    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    print("📊 Dataset splits:")
    print(f" Training: {len(train_dataset)} samples, {len(train_loader)} batches")
    print(f" Validation: {len(val_dataset)} samples, {len(val_loader)} batches")

    # Simulate training loop
    print("\n🏃 Simulated Training Loop:")
    batch_count, epoch_samples = _consume_epoch(
        train_loader, batch_size, num_features, show_first=3
    )
    print(f" Total: {batch_count} batches, {epoch_samples} samples processed")

    # Validate that all samples were seen
    assert epoch_samples == len(train_dataset), f"Expected {len(train_dataset)}, processed {epoch_samples}"

    # The validation loader gets the same per-batch checks and must also
    # cover its whole dataset in one pass.
    _, val_seen = _consume_epoch(val_loader, batch_size, num_features)
    assert val_seen == val_size, f"Expected {val_size}, processed {val_seen}"
    print("✅ Training workflow integration works correctly!")


def _consume_epoch(loader, batch_size, num_features, show_first=0):
    """Iterate `loader` once, checking batch shapes; return (batches, samples).

    Args:
        loader: iterable yielding (features, labels) pairs whose `.data`
            attributes are array-like.
        batch_size: upper bound allowed for the first dimension of a batch.
        num_features: required size of the second dimension of the features.
        show_first: number of leading batches whose size is printed.

    Raises:
        AssertionError: if any batch violates the shape expectations.
    """
    batch_count = 0
    sample_count = 0
    for batch_idx, (batch_features, batch_labels) in enumerate(loader):
        batch_count += 1
        sample_count += len(batch_features.data)
        # Simulate forward pass (just check shapes)
        assert batch_features.data.shape[0] <= batch_size, "Batch size exceeded"
        assert batch_features.data.shape[1] == num_features, "Wrong feature count"
        assert len(batch_labels.data) == len(batch_features.data), "Mismatched batch sizes"
        if batch_idx < show_first:  # Show first few batches
            print(f" Batch {batch_idx + 1}: {batch_features.data.shape[0]} samples")
    return batch_count, sample_count
def test_dataloader_shuffle_consistency():
    """Check that a shuffling DataLoader yields a different first batch per epoch.

    Builds a 100-sample dataset whose features and labels are both 0..99,
    pulls the first batch (size 10) from two fresh iterators over the same
    loader, and asserts that the two feature batches are not identical.

    NOTE(review): presumably the loader draws a new random order for every
    iterator; a spurious failure would need both draws to produce the same
    first batch.
    """
    print("\n🔬 Integration Test: Shuffle Consistency...")

    # Sequential values make any reordering directly visible.
    feature_tensor = Tensor(np.arange(100).reshape(-1, 1).astype(np.float32))
    label_tensor = Tensor(np.arange(100).astype(np.int64))
    shuffled_loader = DataLoader(
        TensorDataset(feature_tensor, label_tensor),
        batch_size=10,
        shuffle=True,
    )

    # Each iter() call starts a new epoch; keep only its first feature batch.
    first_batches = []
    for _ in range(2):
        epoch_iterator = iter(shuffled_loader)
        first_batches.append(next(epoch_iterator)[0].data)

    # Should be different due to shuffle (very high probability)
    if np.array_equal(first_batches[0], first_batches[1]):
        raise AssertionError("Shuffle should produce different orders across epochs")
    print("✅ Shuffle produces different orders across epochs")
def test_dataloader_memory_efficiency():
    """Smoke-test batch-at-a-time iteration over a larger dataset.

    Creates 10000 samples with 50 float32 features each, wraps them in a
    DataLoader with batch_size=64 (no shuffling), and consumes batches until
    the counter exceeds 10, asserting that no batch holds more than 64 rows.

    Note: no memory usage is measured here; the test only shows that batches
    can be consumed one by one and that iteration can be abandoned early.
    """
    print("\n🔬 Integration Test: Memory Efficiency...")

    # Create large-ish dataset
    sample_total = 10000
    feature_tensor = Tensor(np.random.randn(sample_total, 50).astype(np.float32))
    label_tensor = Tensor(np.random.randint(0, 10, sample_total).astype(np.int64))
    loader = DataLoader(
        TensorDataset(feature_tensor, label_tensor),
        batch_size=64,
        shuffle=False,
    )

    # Should be able to iterate without loading all at once
    batch_count = 0
    for batch_count, current_batch in enumerate(loader, start=1):
        # Check batch is reasonable size
        rows_in_batch = current_batch[0].data.shape[0]
        assert rows_in_batch <= 64
        if batch_count > 10:  # Just verify first few batches
            break
    print(f"✅ Processed {batch_count} batches without loading entire dataset")
if __name__ == "__main__":
    # Script entry point: run every integration test in order. The first
    # failing assertion aborts the run before the success banner is printed.
    banner_rule = "=" * 60
    print(banner_rule)
    print("🧪 DATALOADER INTEGRATION TESTS")
    print(banner_rule)
    for run_test in (
        test_training_workflow_integration,
        test_dataloader_shuffle_consistency,
        test_dataloader_memory_efficiency,
    ):
        run_test()
    print("\n" + banner_rule)
    print("🎉 ALL INTEGRATION TESTS PASSED!")
    print(banner_rule)