Files
TinyTorch/modules/06_dataloader/tests/test_dataloader.py
Vijay Janapa Reddi 27208e3492 🏗️ Restructure repository for optimal student/instructor experience
- Move development artifacts to development/archived/ directory
- Remove NBGrader artifacts (assignments/, testing/, gradebook.db, logs)
- Update root README.md to match actual repository structure
- Provide clear navigation paths for instructors and students
- Remove outdated documentation references
- Clean root directory while preserving essential files
- Maintain all functionality while improving organization

Repository is now optimally structured for classroom use with clear entry points:
- Instructors: docs/INSTRUCTOR_GUIDE.md
- Students: docs/STUDENT_GUIDE.md
- Developers: docs/development/

All functionality has been verified to work after the restructuring.
2025-07-12 11:17:36 -04:00

428 lines
16 KiB
Python

"""
Test suite for the dataloader module.
This tests the student implementations to ensure they work correctly.
"""
import pytest
import numpy as np
import sys
import os
import tempfile
import shutil
import pickle
from pathlib import Path
from unittest.mock import patch, MagicMock
# Import from the main package (rock solid foundation)
from tinytorch.core.tensor import Tensor
from tinytorch.core.dataloader import Dataset, DataLoader, CIFAR10Dataset, Normalizer, create_data_pipeline
def safe_numpy(tensor):
    """Return whatever the tensor stores in its ``data`` attribute."""
    payload = tensor.data
    return payload
def safe_item(tensor):
    """Return the single value held in ``tensor.data`` as a Python float.

    Args:
        tensor: Any object exposing a ``data`` attribute that holds exactly
            one element (a Python scalar, a 0-d array, or a size-1 array).

    Returns:
        The element converted to ``float``.

    Raises:
        ValueError: If ``tensor.data`` holds more than one element.
    """
    # ``float()`` applied directly to an array with ndim > 0 is deprecated
    # since NumPy 1.25; ``.item()`` extracts the lone element explicitly and
    # works the same for scalars and 0-d arrays.
    return float(np.asarray(tensor.data).item())
class TestCIFAR10Dataset(Dataset):
    """Dataset backed by the small local fixture instead of a CIFAR-10 download.

    The fixture is read from the ``test_data`` directory that sits next to
    this file. ``root_dir`` and ``download`` are stored on the instance but
    are not otherwise used by this class.
    """

    # The ``Test`` prefix matches pytest's default class-collection pattern,
    # and a class defining ``__init__`` cannot be collected. Opt out
    # explicitly so pytest does not warn about this helper on every run.
    __test__ = False

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        """Load the local fixture.

        Args:
            root_dir: Stored as ``self.root_dir``; not read by this class.
            train: ``True`` loads ``data_batch_1``, ``False`` loads ``test_batch``.
            download: Stored as ``self.download``; nothing is downloaded here.

        Raises:
            FileNotFoundError: If the ``test_data`` directory is missing.
        """
        self.root_dir = root_dir
        self.train = train
        self.download = download

        # Use local test data
        test_data_dir = Path(__file__).parent / "test_data"
        if not test_data_dir.exists():
            raise FileNotFoundError(f"Test data not found at {test_data_dir}")
        self._load_test_data(test_data_dir)

    def _load_test_data(self, data_dir):
        """Populate ``class_names``, ``data`` and ``labels`` from ``data_dir``.

        NOTE: ``pickle.load`` can execute arbitrary code; this is only
        acceptable because the files are fixtures shipped with the tests.
        """
        # Load metadata
        with open(data_dir / "batches.meta", "rb") as f:
            meta_dict = pickle.load(f)
        self.class_names = [name.decode() for name in meta_dict[b'label_names']]

        # Load training or test data
        batch_name = "data_batch_1" if self.train else "test_batch"
        with open(data_dir / batch_name, "rb") as f:
            data_dict = pickle.load(f)

        # Reshape data from (N, 3072) to (N, 3, 32, 32)
        self.data = data_dict[b'data'].reshape(-1, 3, 32, 32)
        self.labels = data_dict[b'labels']

    def __getitem__(self, index: int):
        """Return ``(image, label)`` for ``index`` as a pair of Tensors.

        The image is converted to ``float32``; the label is wrapped in a
        0-d array.
        """
        image = self.data[index]
        label = self.labels[index]
        return Tensor(image.astype(np.float32)), Tensor(np.array(label))

    def __len__(self) -> int:
        """Return the number of samples loaded."""
        return len(self.data)

    def get_num_classes(self) -> int:
        """Return the number of class names found in the metadata."""
        return len(self.class_names)
class TestDatasetInterface:
    """Checks on the contract exposed by the ``Dataset`` base class."""

    def test_dataset_is_abstract(self):
        """A bare ``Dataset`` rejects indexing, ``len`` and class counting."""
        base = Dataset()

        # Every abstract hook is expected to raise NotImplementedError.
        with pytest.raises(NotImplementedError):
            base[0]

        with pytest.raises(NotImplementedError):
            len(base)

        with pytest.raises(NotImplementedError):
            base.get_num_classes()

    def test_concrete_dataset_implementation(self):
        """A subclass that fills in the hooks behaves like a usable dataset."""

        class ThreeClassDataset(Dataset):
            """Random images with labels cycling through 0, 1, 2."""

            def __init__(self, size=10):
                self.size = size
                self.data = [np.random.randn(3, 32, 32) for _ in range(size)]
                self.labels = [i % 3 for i in range(size)]

            def __getitem__(self, index):
                image = Tensor(self.data[index])
                target = Tensor(np.array(self.labels[index]))
                return image, target

            def __len__(self):
                return self.size

            def get_num_classes(self):
                return 3

        toy = ThreeClassDataset(5)

        # Size and class count come straight from the subclass.
        assert len(toy) == 5
        assert toy.get_num_classes() == 3

        # Indexing yields an image tensor and a scalar label tensor.
        first_image, first_label = toy[0]
        assert first_image.shape == (3, 32, 32)
        assert first_label.shape == ()

        # get_sample_shape is called on the subclass without being overridden.
        assert toy.get_sample_shape() == (3, 32, 32)
class TestLocalCIFAR10Dataset:
    """Checks on the CIFAR-10 stand-in that reads the local fixture."""

    def test_cifar10_train_set_load(self):
        """The training split loads with the expected size, shape and names."""
        with tempfile.TemporaryDirectory() as scratch:
            train_set = TestCIFAR10Dataset(scratch, train=True, download=True)

            # The fixture's training split holds 50 samples over 10 classes.
            assert len(train_set) == 50
            assert train_set.get_num_classes() == 10

            # First sample: CIFAR-10 image shape and an in-range label.
            first_image, first_label = train_set[0]
            assert first_image.shape == (3, 32, 32)
            assert 0 <= safe_item(first_label) < 10

            # Class names include the expected entries.
            names = train_set.class_names
            assert len(names) == 10
            assert 'airplane' in names
            assert 'truck' in names

    def test_cifar10_test_set_load(self):
        """The test split loads with the expected size and shape."""
        with tempfile.TemporaryDirectory() as scratch:
            test_set = TestCIFAR10Dataset(scratch, train=False, download=True)

            # The fixture's test split holds 20 samples over 10 classes.
            assert len(test_set) == 20
            assert test_set.get_num_classes() == 10

            first_image, first_label = test_set[0]
            assert first_image.shape == (3, 32, 32)
            assert 0 <= safe_item(first_label) < 10

    def test_cifar10_data_types(self):
        """Samples are Tensors with pixel values in 0-255 and valid labels."""
        with tempfile.TemporaryDirectory() as scratch:
            train_set = TestCIFAR10Dataset(scratch, train=True, download=True)

            # Inspect the first five samples.
            for position in range(5):
                image, label = train_set[position]

                assert isinstance(image, Tensor)
                assert isinstance(label, Tensor)

                # The fixture uses the 0-255 pixel range.
                pixels = safe_numpy(image)
                assert 0 <= pixels.min() <= 255
                assert 0 <= pixels.max() <= 255

                assert 0 <= safe_item(label) < 10
class TestDataLoader:
    """DataLoader behaviour checks against the local CIFAR-10 fixture."""

    def setup_method(self):
        """Create a scratch directory and load the 50-sample training fixture."""
        self.temp_dir = tempfile.mkdtemp()
        # Use local test data
        self.dataset = TestCIFAR10Dataset(self.temp_dir, train=True, download=True)

    def teardown_method(self):
        """Remove the scratch directory created in ``setup_method``."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_dataloader_creation(self):
        """``len(loader)`` counts batches, rounding a partial batch up."""
        # Batch size that does not divide the dataset evenly.
        loader = DataLoader(self.dataset, batch_size=16)
        assert len(loader) == 4  # 50 samples / 16 batch_size = 4 batches (rounded up)

        # Batch size that divides the dataset evenly.
        loader = DataLoader(self.dataset, batch_size=10)
        assert len(loader) == 5  # 50 samples / 10 batch_size = 5 batches

    def test_dataloader_iteration_test_data(self):
        """The first few shuffled batches have sane shapes, types and ranges."""
        loader = DataLoader(self.dataset, batch_size=8, shuffle=True)
        batch_count = 0
        total_samples = 0

        for batch_data, batch_labels in loader:
            batch_count += 1
            batch_size = batch_data.shape[0]
            total_samples += batch_size

            # Check batch shapes
            assert batch_data.shape[1:] == (3, 32, 32)  # CIFAR-10 image shape
            assert batch_labels.shape == (batch_size,)

            # Check data types
            assert isinstance(batch_data, Tensor)
            assert isinstance(batch_labels, Tensor)

            # Check test data properties
            pixels = safe_numpy(batch_data)
            labels = safe_numpy(batch_labels)
            assert 0 <= pixels.min() <= 255
            assert 0 <= pixels.max() <= 255
            assert 0 <= labels.min() < 10
            assert 0 <= labels.max() < 10

            # Check batch size
            assert batch_size <= 8

            if batch_count >= 3:  # Test first few batches
                break

        assert batch_count > 0
        assert total_samples <= len(self.dataset)

    def test_dataloader_shuffling_test_data(self):
        """A shuffled epoch is a permutation of the dataset's labels.

        Two shuffled loaders may legitimately produce the same order, so the
        order itself is not asserted. What must hold is that shuffling
        neither drops nor duplicates samples.
        """
        loader1 = DataLoader(self.dataset, batch_size=10, shuffle=True)
        loader2 = DataLoader(self.dataset, batch_size=10, shuffle=True)
        expected_labels = sorted(int(label) for label in self.dataset.labels)

        for loader in (loader1, loader2):
            label_batches = [safe_numpy(labels) for _, labels in loader]
            assert label_batches[0].shape == (10,)
            seen_labels = np.concatenate(label_batches).tolist()
            assert sorted(seen_labels) == expected_labels

    def test_dataloader_no_shuffle_test_data(self):
        """Without shuffling, the first batch is the first ten samples in order."""
        loader = DataLoader(self.dataset, batch_size=10, shuffle=False)

        # Get first batch
        batch_data, batch_labels = next(iter(loader))

        # Without shuffling, should get first 10 samples in order
        expected_samples = [self.dataset[i] for i in range(10)]
        expected_labels = [safe_item(sample[1]) for sample in expected_samples]
        np.testing.assert_array_equal(safe_numpy(batch_labels), expected_labels)
class TestNormalizer:
    """Normalizer checks driven by images from the local fixture."""

    def setup_method(self):
        """Collect the first 20 training images as the working sample set."""
        self.temp_dir = tempfile.mkdtemp()
        fixture = TestCIFAR10Dataset(self.temp_dir, train=True, download=True)
        # Keep only the image of each (image, label) pair.
        self.test_data = [fixture[position][0] for position in range(20)]

    def teardown_method(self):
        """Delete the scratch directory made during setup."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_normalizer_fit_test_data(self):
        """Fitting produces a mean within 0-255 and a positive std."""
        fitted = Normalizer()
        fitted.fit(self.test_data)

        # Both statistics must have been computed.
        assert fitted.mean is not None
        assert fitted.std is not None

        # Fixture pixels are in 0-255, so the mean has to land there too.
        assert 0 <= fitted.mean <= 255
        # Some variation across pixels is expected.
        assert fitted.std > 0

    def test_normalizer_transform_test_data(self):
        """Transforming one sample changes it and pulls its mean toward zero."""
        fitted = Normalizer()
        fitted.fit(self.test_data)

        raw = self.test_data[0]
        scaled = fitted.transform(raw)

        raw_values = safe_numpy(raw)
        scaled_values = safe_numpy(scaled)

        # Normalization must actually alter the values.
        assert not np.allclose(raw_values, scaled_values)

        # The normalized mean should sit closer to zero than the original.
        raw_mean = np.mean(raw_values)
        scaled_mean = np.mean(scaled_values)
        assert abs(scaled_mean) < abs(raw_mean)

    def test_normalizer_transform_batch_test_data(self):
        """Transforming a list keeps its length and changes every sample."""
        fitted = Normalizer()
        fitted.fit(self.test_data)

        raw_batch = self.test_data[:5]
        scaled_batch = fitted.transform(raw_batch)

        # Same number of samples out as in.
        assert len(scaled_batch) == len(raw_batch)

        # Each sample individually differs after normalization.
        for raw, scaled in zip(raw_batch, scaled_batch):
            assert not np.allclose(safe_numpy(raw), safe_numpy(scaled))
class TestDataPipeline:
    """End-to-end checks combining dataset, loader and normalizer."""

    def test_create_data_pipeline_test_data(self):
        """A manually assembled dataset + loader pipeline can be iterated."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # Copy test data to temp directory. TestCIFAR10Dataset itself
            # reads the fixture next to this file, so this only checks that
            # the fixture can be staged into a scratch location.
            test_data_dir = Path(__file__).parent / "test_data"
            shutil.copytree(test_data_dir, Path(temp_dir) / "test_data")

            # Create pipeline (this would normally download CIFAR-10)
            # For testing, we'll create a simple pipeline manually
            dataset = TestCIFAR10Dataset(temp_dir, train=True, download=True)
            dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

            # Test pipeline components
            assert len(dataset) == 50  # Our test training set
            assert len(dataloader) == 7  # 50 samples / 8 batch_size = 7 batches

            # Test that we can iterate through the pipeline
            batch_count = 0
            for batch_data, batch_labels in dataloader:
                batch_count += 1
                assert batch_data.shape[1:] == (3, 32, 32)
                assert batch_labels.shape[0] <= 8
                if batch_count >= 3:  # Test first few batches
                    break
            assert batch_count > 0

    def test_pipeline_normalization_test_data(self):
        """A fitted normalizer changes single samples and whole batches."""
        with tempfile.TemporaryDirectory() as temp_dir:
            dataset = TestCIFAR10Dataset(temp_dir, train=True, download=True)

            # Get some samples for normalization
            samples = [dataset[i][0] for i in range(10)]

            # Create and fit normalizer
            normalizer = Normalizer()
            normalizer.fit(samples)

            # Test that normalization works
            normalized = normalizer.transform(samples[0])
            assert not np.allclose(safe_numpy(samples[0]), safe_numpy(normalized))

            # Test with dataloader
            dataloader = DataLoader(dataset, batch_size=5, shuffle=False)
            batch_data, batch_labels = next(iter(dataloader))

            # Normalize the batch one sample at a time, then restack it.
            normalized_batch = Tensor(np.stack([
                normalizer.transform(Tensor(batch_data.data[i])).data
                for i in range(batch_data.shape[0])
            ]))

            # Check that batch normalization works
            assert normalized_batch.shape == batch_data.shape
            assert not np.allclose(safe_numpy(batch_data), safe_numpy(normalized_batch))
class TestEdgeCases:
    """Boundary-condition checks run against the local fixture."""

    def test_small_batch_size_test_data(self):
        """With a batch size of one, each batch carries exactly one sample."""
        with tempfile.TemporaryDirectory() as scratch:
            fixture = TestCIFAR10Dataset(scratch, train=True, download=True)
            single_loader = DataLoader(fixture, batch_size=1, shuffle=False)

            # Look at no more than the first five batches; the counter keeps
            # its initial zero if the loader yields nothing.
            batch_count = 0
            for batch_count, (images, targets) in zip(range(1, 6), single_loader):
                assert images.shape == (1, 3, 32, 32)
                assert targets.shape == (1,)

            assert batch_count == 5
def run_data_tests():
    """Run every test in this file verbosely.

    Returns:
        The exit code reported by ``pytest.main`` (0 when all tests pass).
    """
    return pytest.main([__file__, "-v"])


if __name__ == "__main__":
    # Propagate pytest's result so a shell or CI job sees a non-zero status
    # when any test fails, instead of always exiting with 0.
    sys.exit(run_data_tests())