mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-28 04:27:32 -05:00
- Export all modules with CIFAR-10 and checkpointing enhancements - Create demo_cifar10_training.py showing complete pipeline - Fix module issues preventing clean imports - Validate all components work together - Confirm students can achieve 75% CIFAR-10 accuracy goal Pipeline validated: ✅ CIFAR-10 dataset downloading ✅ Model creation and training ✅ Checkpointing for best models ✅ Evaluation tools ✅ Complete end-to-end workflow
459 lines
16 KiB
Python
Generated
459 lines
16 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/08_dataloader/dataloader_dev.ipynb.
|
|
|
|
# %% auto 0
|
|
__all__ = ['Dataset', 'DataLoader', 'SimpleDataset', 'download_cifar10', 'CIFAR10Dataset']
|
|
|
|
# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 1
|
|
import numpy as np
|
|
import sys
|
|
import os
|
|
from typing import Tuple, Optional, Iterator
|
|
import urllib.request
|
|
import tarfile
|
|
import pickle
|
|
import time
|
|
|
|
# Import our building blocks - try package first, then local modules
|
|
try:
|
|
from tinytorch.core.tensor import Tensor
|
|
except ImportError:
|
|
# For development, import from local modules
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
|
|
from tensor_dev import Tensor
|
|
|
|
# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 7
|
|
class Dataset:
    """Abstract base class for all TinyTorch datasets.

    Mirrors the interface of ``torch.utils.data.Dataset``: concrete
    datasets subclass this and implement ``__getitem__``, ``__len__``,
    and ``get_num_classes`` so that ``DataLoader`` can batch them.
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return the ``(data, label)`` tensor pair at position ``index``.

        Abstract — concrete subclasses must override this.

        Args:
            index: Position of the sample to fetch.

        Returns:
            Tuple of ``(data, label)`` tensors, e.g.
            ``(Tensor(image_data), Tensor(label))``.

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError("Subclasses must implement __getitem__")

    def __len__(self) -> int:
        """Return the total number of samples in the dataset.

        Abstract — concrete subclasses must override this. DataLoader
        relies on this to compute the number of batches per epoch.

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError("Subclasses must implement __len__")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """Return the shape of a single data sample.

        Peeks at sample 0, so the dataset must be non-empty. For
        CIFAR-10 this is ``(3, 32, 32)``.

        Returns:
            Shape tuple of one data tensor (label shape is not included).
        """
        # Fetch the first sample and report its data tensor's shape.
        first_data, _ = self[0]
        return first_data.shape

    def get_num_classes(self) -> int:
        """Return the number of distinct classes in the dataset.

        Abstract — concrete subclasses must override this. Used to size
        a model's output layer (e.g. 10 for CIFAR-10).

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError("Subclasses must implement get_num_classes")
|
|
|
# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 11
|
|
class DataLoader:
    """Iterates over a Dataset in (optionally shuffled) fixed-size batches.

    Each epoch yields ``(batch_data, batch_labels)`` tensor pairs whose
    leading dimension is the batch size (the final batch may be smaller).
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """Configure the loader.

        Args:
            dataset: Source Dataset to draw samples from.
            batch_size: Number of samples per yielded batch (positive int).
            shuffle: If True, re-shuffle the sample order every epoch.

        Raises:
            TypeError: If ``dataset`` is None.
            ValueError: If ``batch_size`` is not a positive integer.
        """
        # Validate inputs before storing anything.
        if dataset is None:
            raise TypeError("Dataset cannot be None")
        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError(f"Batch size must be a positive integer, got {batch_size}")

        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """Yield ``(batch_data, batch_labels)`` tensor pairs for one epoch.

        Sample order is shuffled in place each epoch when ``self.shuffle``
        is set; samples are gathered one at a time via ``dataset[idx]``
        and stacked along a new leading batch axis.
        """
        order = list(range(len(self.dataset)))
        if self.shuffle:
            # In-place shuffle of the index order for this epoch.
            np.random.shuffle(order)

        total = len(order)
        start = 0
        while start < total:
            chunk = order[start:start + self.batch_size]
            start += self.batch_size

            # Gather raw arrays for every sample in this chunk.
            samples = [self.dataset[j] for j in chunk]
            data_arrays = [sample.data for sample, _ in samples]
            label_arrays = [target.data for _, target in samples]

            # Stack along a fresh leading axis to form the batch tensors.
            yield (Tensor(np.stack(data_arrays, axis=0)),
                   Tensor(np.stack(label_arrays, axis=0)))

    def __len__(self) -> int:
        """Return the number of batches per epoch (ceiling division).

        Example: 100 samples at batch size 32 -> 4 batches.
        """
        # Negative floor division implements ceiling division exactly.
        return -(-len(self.dataset) // self.batch_size)
|
|
|
# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 15
|
|
class SimpleDataset(Dataset):
    """
    Simple dataset for testing and demonstration.

    Generates synthetic data with configurable size and properties.
    Perfect for understanding the Dataset pattern.
    """

    def __init__(self, size: int = 100, num_features: int = 4, num_classes: int = 3):
        """
        Initialize SimpleDataset with deterministic synthetic data.

        Args:
            size: Number of samples in the dataset
            num_features: Number of features per sample
            num_classes: Number of classes

        Example:
            SimpleDataset(size=100, num_features=4, num_classes=3)
            creates 100 samples with 4 features each, 3 classes.
        """
        self.size = size
        self.num_features = num_features
        self.num_classes = num_classes

        # Use a private seeded RandomState instead of np.random.seed(42):
        # seeding the *global* RNG here would silently make every later
        # np.random.* call in the caller's program deterministic as a side
        # effect of merely constructing a dataset. RandomState(42) yields
        # the identical randn/randint sequence without that side effect.
        rng = np.random.RandomState(42)
        self.data = rng.randn(size, num_features).astype(np.float32)
        self.labels = rng.randint(0, num_classes, size=size)

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a sample by index.

        Args:
            index: Index of the sample

        Returns:
            Tuple of (data, label) tensors, e.g.
            dataset[0] -> (Tensor(features), Tensor(label))
        """
        data = self.data[index]
        label = self.labels[index]
        return Tensor(data), Tensor(label)

    def __len__(self) -> int:
        """Return the dataset size (number of samples)."""
        return self.size

    def get_num_classes(self) -> int:
        """Return the number of classes in this dataset."""
        return self.num_classes
|
|
|
# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 17
|
|
def download_cifar10(root: str = "./data") -> str:
    """
    Download and extract the CIFAR-10 python batches into ``root``.

    Skips the download when ``root/cifar-10-batches-py`` already exists.
    The ~170MB tarball is deleted after successful extraction to free disk.

    Args:
        root: Directory to place the dataset in (created if missing).

    Returns:
        Path to the extracted ``cifar-10-batches-py`` directory.
    """
    os.makedirs(root, exist_ok=True)
    dataset_dir = os.path.join(root, "cifar-10-batches-py")

    # Already downloaded and extracted — nothing to do.
    if os.path.exists(dataset_dir):
        print(f"✅ CIFAR-10 found at {dataset_dir}")
        return dataset_dir

    url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    tar_path = os.path.join(root, "cifar-10.tar.gz")

    print(f"📥 Downloading CIFAR-10 (~170MB)...")
    urllib.request.urlretrieve(url, tar_path)
    print("✅ Downloaded!")

    print("📦 Extracting...")
    with tarfile.open(tar_path, 'r:gz') as tar:
        try:
            # filter="data" (Python 3.12+) blocks tar path-traversal tricks
            # such as absolute paths or ".." members in the archive.
            tar.extractall(root, filter="data")
        except TypeError:
            # Older Python: no `filter` parameter; extract as before.
            tar.extractall(root)
    print("✅ Ready!")

    # Extraction succeeded — the archive is no longer needed.
    os.remove(tar_path)

    return dataset_dir
|
|
|
class CIFAR10Dataset(Dataset):
    """CIFAR-10 dataset for CNN training."""

    def __init__(self, root="./data", train=True, download=False):
        """Load CIFAR-10 data.

        Args:
            root: Directory containing (or to receive) the dataset.
            train: Load the 50k training batches if True, else the test batch.
            download: If True, fetch the dataset first via download_cifar10().
        """
        dataset_dir = (download_cifar10(root) if download
                       else os.path.join(root, "cifar-10-batches-py"))

        def _read_batch(name):
            # Each batch file is a pickled dict with b'data' / b'labels'.
            with open(os.path.join(dataset_dir, name), 'rb') as f:
                return pickle.load(f, encoding='bytes')

        if train:
            # Training set spans five pickled batch files.
            batches = [_read_batch(f"data_batch_{i}") for i in range(1, 6)]
            self.data = np.concatenate([b[b'data'] for b in batches])
            self.labels = np.array([lbl for b in batches for lbl in b[b'labels']])
        else:
            test_batch = _read_batch("test_batch")
            self.data = test_batch[b'data']
            self.labels = np.array(test_batch[b'labels'])

        # Rows arrive as flat 3072-byte vectors; reshape to channel-first
        # (N, 3, 32, 32) images and scale pixel values into [0, 1].
        self.data = self.data.reshape(-1, 3, 32, 32).astype(np.float32) / 255.0
        print(f"✅ Loaded {len(self.data):,} images")

    def __getitem__(self, idx):
        """Return the (image, label) tensor pair at ``idx``."""
        image, target = self.data[idx], self.labels[idx]
        return Tensor(image), Tensor(target)

    def __len__(self):
        """Return the number of loaded images."""
        return len(self.data)

    def get_num_classes(self):
        """CIFAR-10 always has exactly 10 classes."""
        return 10