mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-29 20:08:33 -05:00
refactor: rename data module to dataloader
- Rename modules/data/ → modules/dataloader/ - Rename data_dev.py → dataloader_dev.py - Update NBDev export target: core.data → core.dataloader - Rename test files: test_data.py → test_dataloader.py - Update package exports to tinytorch.core.dataloader - Update module imports and internal references This makes the module name more descriptive and aligned with ML industry standards.
This commit is contained in:
@@ -23,6 +23,50 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/core/activations.py'),
|
||||
'tinytorch.core.activations.Tanh.forward': ( 'activations/activations_dev.html#tanh.forward',
|
||||
'tinytorch/core/activations.py')},
|
||||
'tinytorch.core.dataloader': { 'tinytorch.core.dataloader.CIFAR10Dataset': ( 'dataloader/dataloader_dev.html#cifar10dataset',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset.__getitem__': ( 'dataloader/dataloader_dev.html#cifar10dataset.__getitem__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset.__init__': ( 'dataloader/dataloader_dev.html#cifar10dataset.__init__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset.__len__': ( 'dataloader/dataloader_dev.html#cifar10dataset.__len__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset._download_if_needed': ( 'dataloader/dataloader_dev.html#cifar10dataset._download_if_needed',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset._load_data': ( 'dataloader/dataloader_dev.html#cifar10dataset._load_data',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset.get_num_classes': ( 'dataloader/dataloader_dev.html#cifar10dataset.get_num_classes',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.DataLoader': ( 'dataloader/dataloader_dev.html#dataloader',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.DataLoader.__init__': ( 'dataloader/dataloader_dev.html#dataloader.__init__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.DataLoader.__iter__': ( 'dataloader/dataloader_dev.html#dataloader.__iter__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.DataLoader.__len__': ( 'dataloader/dataloader_dev.html#dataloader.__len__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset': ( 'dataloader/dataloader_dev.html#dataset',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset.__getitem__': ( 'dataloader/dataloader_dev.html#dataset.__getitem__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset.__len__': ( 'dataloader/dataloader_dev.html#dataset.__len__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset.get_num_classes': ( 'dataloader/dataloader_dev.html#dataset.get_num_classes',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset.get_sample_shape': ( 'dataloader/dataloader_dev.html#dataset.get_sample_shape',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Normalizer': ( 'dataloader/dataloader_dev.html#normalizer',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Normalizer.__init__': ( 'dataloader/dataloader_dev.html#normalizer.__init__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Normalizer.fit': ( 'dataloader/dataloader_dev.html#normalizer.fit',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Normalizer.transform': ( 'dataloader/dataloader_dev.html#normalizer.transform',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader._should_show_plots': ( 'dataloader/dataloader_dev.html#_should_show_plots',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.create_data_pipeline': ( 'dataloader/dataloader_dev.html#create_data_pipeline',
|
||||
'tinytorch/core/dataloader.py')},
|
||||
'tinytorch.core.layers': { 'tinytorch.core.layers.Dense': ('layers/layers_dev.html#dense', 'tinytorch/core/layers.py'),
|
||||
'tinytorch.core.layers.Dense.__call__': ( 'layers/layers_dev.html#dense.__call__',
|
||||
'tinytorch/core/layers.py'),
|
||||
|
||||
694
tinytorch/core/dataloader.py
Normal file
694
tinytorch/core/dataloader.py
Normal file
@@ -0,0 +1,694 @@
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/dataloader/dataloader_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Dataset', 'CIFAR10Dataset', 'DataLoader', 'Normalizer', 'create_data_pipeline']
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 4
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
import pickle
|
||||
import struct
|
||||
from typing import List, Tuple, Optional, Union, Iterator
|
||||
import matplotlib.pyplot as plt
|
||||
import urllib.request
|
||||
import tarfile
|
||||
|
||||
# Import our building blocks
|
||||
from .tensor import Tensor
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 5
|
||||
def _should_show_plots():
|
||||
"""Check if we should show plots (disable during testing)"""
|
||||
return 'pytest' not in sys.modules and 'test' not in sys.argv
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 7
|
||||
# NOTE: nbdev scaffold cell — every method body is a student TODO stub.
# The reference implementation exported from the next cell shadows this class.
class Dataset:
    """
    Base Dataset class: Abstract interface for all datasets.

    The fundamental abstraction for data loading in TinyTorch.
    Students implement concrete datasets by inheriting from this class.

    TODO: Implement the base Dataset class with required methods.

    APPROACH:
    1. Define the interface that all datasets must implement
    2. Include methods for getting individual samples and dataset size
    3. Make it easy to extend for different data types

    EXAMPLE:
    dataset = CIFAR10Dataset("data/cifar10/")
    sample, label = dataset[0]  # Get first sample
    size = len(dataset)  # Get dataset size

    HINTS:
    - Use abstract methods that subclasses must implement
    - Include __getitem__ for indexing and __len__ for size
    - Add helper methods for getting sample shapes and number of classes
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single sample and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (data, label) tensors

        TODO: Implement abstract method for getting samples.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return a tuple of (data, label) tensors
        3. Data should be the input features, label should be the target

        EXAMPLE:
        dataset[0] should return (Tensor(image_data), Tensor(label))
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        TODO: Implement abstract method for getting dataset size.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return the total number of samples in the dataset

        EXAMPLE:
        len(dataset) should return 50000 for CIFAR-10 training set
        """
        raise NotImplementedError("Student implementation required")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """
        Get the shape of a single data sample.

        TODO: Implement method to get sample shape.

        STEP-BY-STEP:
        1. Get the first sample using self[0]
        2. Extract the data part (first element of tuple)
        3. Return the shape of the data tensor

        EXAMPLE:
        For CIFAR-10: returns (3, 32, 32) for RGB images
        """
        raise NotImplementedError("Student implementation required")

    def get_num_classes(self) -> int:
        """
        Get the number of classes in the dataset.

        TODO: Implement abstract method for getting number of classes.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return the total number of classes in the dataset

        EXAMPLE:
        For CIFAR-10: returns 10 (airplane, car, bird, cat, deer, dog, frog, horse, ship, truck)
        """
        raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 8
|
||||
class Dataset:
    """Abstract base for all TinyTorch datasets.

    Subclasses must provide `__getitem__`, `__len__`, and `get_num_classes`;
    `get_sample_shape` is derived from the first sample.
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return the (data, label) tensor pair at `index`."""
        raise NotImplementedError("Subclasses must implement __getitem__")

    def __len__(self) -> int:
        """Return how many samples the dataset contains."""
        raise NotImplementedError("Subclasses must implement __len__")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """Return the shape of one data sample, probed from index 0."""
        first = self[0]
        data_part = first[0]
        return data_part.shape

    def get_num_classes(self) -> int:
        """Return the number of distinct labels in the dataset."""
        raise NotImplementedError("Subclasses must implement get_num_classes")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 12
|
||||
# NOTE: nbdev scaffold cell — student TODO stubs only; the working
# implementation exported from a later cell shadows this class.
class CIFAR10Dataset(Dataset):
    """
    CIFAR-10 Dataset: Load and manage CIFAR-10 image data.

    CIFAR-10 contains 60,000 32x32 color images in 10 classes.
    Perfect for learning data loading and image processing.

    Args:
        root_dir: Directory containing CIFAR-10 files
        train: If True, load training data. If False, load test data.
        download: If True, download dataset if not present

    TODO: Implement CIFAR-10 dataset loading.

    APPROACH:
    1. Handle dataset download if needed (with progress bar!)
    2. Parse binary files to extract images and labels
    3. Store data efficiently in memory
    4. Implement indexing and size methods

    EXAMPLE:
    dataset = CIFAR10Dataset("data/cifar10/", train=True)
    image, label = dataset[0]  # Get first image
    print(f"Image shape: {image.shape}")  # (3, 32, 32)
    print(f"Label: {label}")  # Tensor with class index

    HINTS:
    - Use pickle to load binary files
    - Each batch file contains 'data' and 'labels' keys
    - Reshape data to (3, 32, 32) format
    - Store images and labels as separate lists
    - Add progress bar with urllib.request.urlretrieve(url, filename, reporthook=progress_function)
    - Progress function receives (block_num, block_size, total_size) parameters
    """

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        """
        Initialize CIFAR-10 dataset.

        Args:
            root_dir: Directory to store/load dataset
            train: If True, load training data. If False, load test data.
            download: If True, download dataset if not present

        TODO: Implement CIFAR-10 initialization.

        STEP-BY-STEP:
        1. Create root directory if it doesn't exist
        2. Download dataset if needed and not present (with progress bar!)
        3. Load binary files and parse data
        4. Store images and labels in memory
        5. Set up class names

        EXAMPLE:
        CIFAR10Dataset("data/cifar10/", train=True)
        creates a dataset with 50,000 training images

        PROGRESS BAR HINT:
        def show_progress(block_num, block_size, total_size):
            downloaded = block_num * block_size
            percent = (downloaded * 100) // total_size
            print(f"\\rDownloading: {percent}%", end='', flush=True)
        """
        raise NotImplementedError("Student implementation required")

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single image and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (image, label) tensors

        TODO: Implement sample retrieval.

        STEP-BY-STEP:
        1. Get image from self.images[index]
        2. Get label from self.labels[index]
        3. Return (Tensor(image), Tensor(label))

        EXAMPLE:
        image, label = dataset[0]
        image.shape should be (3, 32, 32)
        label should be integer 0-9
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        TODO: Return the length of the dataset.

        STEP-BY-STEP:
        1. Return len(self.images)

        EXAMPLE:
        Training set: 50,000 samples
        Test set: 10,000 samples
        """
        raise NotImplementedError("Student implementation required")

    def get_num_classes(self) -> int:
        """
        Get the number of classes in CIFAR-10.

        TODO: Return the number of classes.

        STEP-BY-STEP:
        1. CIFAR-10 has 10 classes
        2. Return 10

        EXAMPLE:
        Returns 10 for CIFAR-10
        """
        raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 13
|
||||
class CIFAR10Dataset(Dataset):
    """CIFAR-10 Dataset: Load and manage CIFAR-10 image data.

    Downloads (if requested) and parses the python-pickle distribution of
    CIFAR-10 from cs.toronto.edu, keeping all images and labels in memory.
    """

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        """Set up paths/metadata, optionally download, then load into memory.

        Args:
            root_dir: Directory to store/load the dataset under.
            train: Load the 5 training batches if True, else the test batch.
            download: Fetch the archive from the web when it is missing.
        """
        self.root_dir = root_dir
        self.train = train
        # Human-readable names for the 10 class indices (0..9).
        self.class_names = ['airplane', 'car', 'bird', 'cat', 'deer',
                            'dog', 'frog', 'horse', 'ship', 'truck']

        # Create directory if it doesn't exist
        os.makedirs(root_dir, exist_ok=True)

        # Download if needed
        if download:
            self._download_if_needed()

        # Load data
        self._load_data()

    def _download_if_needed(self):
        """Download CIFAR-10 if not present."""
        # Presence of the extracted batch directory is the "already here" marker.
        cifar_path = os.path.join(self.root_dir, "cifar-10-batches-py")
        if not os.path.exists(cifar_path):
            print("🔄 Downloading CIFAR-10 dataset...")
            print("📦 Size: ~170MB (this may take a few minutes)")
            url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
            filename = os.path.join(self.root_dir, "cifar-10-python.tar.gz")

            try:
                # Download with progress bar
                def show_progress(block_num, block_size, total_size):
                    """Show download progress bar."""
                    # urlretrieve reports blocks, not bytes, so reconstruct the count.
                    downloaded = block_num * block_size
                    if total_size > 0:
                        # min() guards against the final block overshooting 100%.
                        percent = min(100, (downloaded * 100) // total_size)
                        bar_length = 50
                        filled_length = (percent * bar_length) // 100
                        bar = '█' * filled_length + '░' * (bar_length - filled_length)

                        # Convert bytes to MB
                        downloaded_mb = downloaded / (1024 * 1024)
                        total_mb = total_size / (1024 * 1024)

                        print(f"\r📥 [{bar}] {percent}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)", end='', flush=True)
                    else:
                        # Fallback if total size unknown
                        downloaded_mb = downloaded / (1024 * 1024)
                        print(f"\r📥 Downloaded: {downloaded_mb:.1f} MB", end='', flush=True)

                urllib.request.urlretrieve(url, filename, reporthook=show_progress)
                print()  # New line after progress bar

                # Extract
                print("📂 Extracting CIFAR-10 files...")
                with tarfile.open(filename, 'r:gz') as tar:
                    # filter='data' rejects unsafe archive members
                    # (requires Python 3.12+, or 3.9.17/3.10.12/3.11.4 backports).
                    tar.extractall(self.root_dir, filter='data')

                # Clean up
                os.remove(filename)
                print("✅ CIFAR-10 downloaded and extracted successfully!")

            except Exception as e:
                # Best-effort: report the failure and point at the manual source
                # rather than crashing construction.
                print(f"\n❌ Download failed: {e}")
                print("Please download CIFAR-10 manually from https://www.cs.toronto.edu/~kriz/cifar.html")

    def _load_data(self):
        """Load CIFAR-10 data from binary files."""
        cifar_path = os.path.join(self.root_dir, "cifar-10-batches-py")

        self.images = []
        self.labels = []

        if self.train:
            # Load training batches
            for i in range(1, 6):
                batch_file = os.path.join(cifar_path, f"data_batch_{i}")
                if os.path.exists(batch_file):
                    with open(batch_file, 'rb') as f:
                        # The official pickles were written by Python 2;
                        # encoding='bytes' makes them loadable on Python 3.
                        batch = pickle.load(f, encoding='bytes')
                    # Convert bytes keys to strings
                    batch = {k.decode('utf-8') if isinstance(k, bytes) else k: v for k, v in batch.items()}

                    # Extract images and labels
                    # Each row is 3072 bytes -> reshape to channels-first (3, 32, 32).
                    images = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
                    labels = batch['labels']

                    self.images.extend(images)
                    self.labels.extend(labels)
        else:
            # Load test batch
            test_file = os.path.join(cifar_path, "test_batch")
            if os.path.exists(test_file):
                with open(test_file, 'rb') as f:
                    batch = pickle.load(f, encoding='bytes')
                # Convert bytes keys to strings
                batch = {k.decode('utf-8') if isinstance(k, bytes) else k: v for k, v in batch.items()}

                # Extract images and labels
                # NOTE(review): train stores a list of arrays, test a single
                # ndarray — both support len()/indexing as used below.
                self.images = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
                self.labels = batch['labels']

        print(f"✅ Loaded {len(self.images)} {'training' if self.train else 'test'} samples")

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Get a single image and label by index.

        Returns:
            (image, label) pair wrapped as Tensors; image is (3, 32, 32) float32.
        """
        image = Tensor(self.images[index])
        label = Tensor(np.array(self.labels[index]))
        return image, label

    def __len__(self) -> int:
        """Get the total number of samples in the dataset."""
        return len(self.images)

    def get_num_classes(self) -> int:
        """Get the number of classes in CIFAR-10 (always 10)."""
        return 10
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 21
|
||||
# NOTE: nbdev scaffold cell — student TODO stubs only; shadowed by the
# working DataLoader exported from a later cell.
class DataLoader:
    """
    DataLoader: Efficiently batch and iterate through datasets.

    Provides batching, shuffling, and efficient iteration over datasets.
    Essential for training neural networks efficiently.

    Args:
        dataset: Dataset to load from
        batch_size: Number of samples per batch
        shuffle: Whether to shuffle data each epoch

    TODO: Implement DataLoader with batching and shuffling.

    APPROACH:
    1. Store dataset and configuration
    2. Implement __iter__ to yield batches
    3. Handle shuffling and batching logic
    4. Stack individual samples into batches

    EXAMPLE:
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    for batch_images, batch_labels in dataloader:
        print(f"Batch shape: {batch_images.shape}")  # (32, 3, 32, 32)

    HINTS:
    - Use np.random.permutation for shuffling
    - Stack samples using np.stack
    - Yield batches as (batch_data, batch_labels)
    - Handle last batch that might be smaller
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """
        Initialize DataLoader.

        Args:
            dataset: Dataset to load from
            batch_size: Number of samples per batch
            shuffle: Whether to shuffle data each epoch

        TODO: Store configuration and dataset.

        STEP-BY-STEP:
        1. Store dataset as self.dataset
        2. Store batch_size as self.batch_size
        3. Store shuffle as self.shuffle

        EXAMPLE:
        DataLoader(dataset, batch_size=32, shuffle=True)
        """
        raise NotImplementedError("Student implementation required")

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """
        Iterate through dataset in batches.

        Returns:
            Iterator yielding (batch_data, batch_labels) tuples

        TODO: Implement batching and shuffling logic.

        STEP-BY-STEP:
        1. Create indices list: list(range(len(dataset)))
        2. Shuffle indices if self.shuffle is True
        3. Loop through indices in batch_size chunks
        4. For each batch: collect samples, stack them, yield batch

        EXAMPLE:
        for batch_data, batch_labels in dataloader:
            # batch_data.shape: (batch_size, ...)
            # batch_labels.shape: (batch_size,)
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the number of batches per epoch.

        TODO: Calculate number of batches.

        STEP-BY-STEP:
        1. Get dataset size: len(self.dataset)
        2. Calculate: (dataset_size + batch_size - 1) // batch_size
        3. This handles the last partial batch correctly

        EXAMPLE:
        Dataset size: 100, batch_size: 32
        Number of batches: 4 (32, 32, 32, 4)
        """
        raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 22
|
||||
class DataLoader:
    """Batch-and-shuffle iterator over a Dataset.

    Each epoch yields (batch_data, batch_labels) Tensor pairs; the final
    batch may be smaller than `batch_size`.
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """Remember the dataset and the batching configuration."""
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """Yield batches for one epoch, reshuffling (in place) if enabled."""
        order = list(range(len(self.dataset)))
        if self.shuffle:
            np.random.shuffle(order)

        start = 0
        total = len(order)
        while start < total:
            chunk = order[start:start + self.batch_size]
            start += self.batch_size

            # Pull each sample once, then split into data/label columns.
            samples = [self.dataset[j] for j in chunk]
            stacked_data = np.stack([d.data for d, _ in samples], axis=0)
            stacked_labels = np.stack([l.data for _, l in samples], axis=0)

            yield Tensor(stacked_data), Tensor(stacked_labels)

    def __len__(self) -> int:
        """Number of batches per epoch (last partial batch counts as one)."""
        full, remainder = divmod(len(self.dataset), self.batch_size)
        return full + (1 if remainder else 0)
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 27
|
||||
# NOTE: nbdev scaffold cell — student TODO stubs only; shadowed by the
# working Normalizer exported from a later cell.
class Normalizer:
    """
    Data Normalizer: Standardize data for better training.

    Computes mean and standard deviation from training data,
    then applies normalization to new data.

    TODO: Implement data normalization.

    APPROACH:
    1. Fit: Compute mean and std from training data
    2. Transform: Apply normalization using computed stats
    3. Handle both single tensors and batches

    EXAMPLE:
    normalizer = Normalizer()
    normalizer.fit(training_data)  # Compute stats
    normalized = normalizer.transform(new_data)  # Apply normalization

    HINTS:
    - Store mean and std as instance variables
    - Use np.mean and np.std for statistics
    - Apply: (data - mean) / std
    - Handle division by zero (add small epsilon)
    """

    def __init__(self):
        """
        Initialize normalizer.

        TODO: Initialize mean and std to None.

        STEP-BY-STEP:
        1. Set self.mean = None
        2. Set self.std = None
        3. Set self.epsilon = 1e-8 (for numerical stability)

        EXAMPLE:
        normalizer = Normalizer()
        """
        raise NotImplementedError("Student implementation required")

    def fit(self, data: List[Tensor]):
        """
        Compute normalization statistics from training data.

        Args:
            data: List of tensors to compute statistics from

        TODO: Compute mean and standard deviation.

        STEP-BY-STEP:
        1. Stack all tensors: np.stack([t.data for t in data])
        2. Compute mean: np.mean(stacked_data)
        3. Compute std: np.std(stacked_data)
        4. Store as self.mean and self.std

        EXAMPLE:
        normalizer.fit([tensor1, tensor2, tensor3])
        """
        raise NotImplementedError("Student implementation required")

    def transform(self, data: Union[Tensor, List[Tensor]]) -> Union[Tensor, List[Tensor]]:
        """
        Apply normalization to data.

        Args:
            data: Tensor or list of tensors to normalize

        Returns:
            Normalized tensor(s)

        TODO: Apply normalization using computed statistics.

        STEP-BY-STEP:
        1. Check if mean and std are computed (not None)
        2. If single tensor: apply (data - mean) / (std + epsilon)
        3. If list: apply to each tensor in the list
        4. Return normalized data

        EXAMPLE:
        normalized = normalizer.transform(tensor)
        """
        raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 28
|
||||
class Normalizer:
    """Standardizes data to zero mean / unit variance using fitted statistics."""

    def __init__(self):
        """Start unfitted; epsilon guards against division by zero."""
        self.mean = None
        self.std = None
        self.epsilon = 1e-8

    def fit(self, data: List[Tensor]):
        """Compute and store global mean/std over all elements of `data`."""
        stacked = np.stack([sample.data for sample in data])
        self.mean, self.std = np.mean(stacked), np.std(stacked)
        print(f"✅ Computed normalization stats: mean={self.mean:.4f}, std={self.std:.4f}")

    def transform(self, data: Union[Tensor, List[Tensor]]) -> Union[Tensor, List[Tensor]]:
        """Return (data - mean) / (std + epsilon), element-wise.

        Accepts either one tensor or a list of tensors; must be fitted first.

        Raises:
            ValueError: if fit() has not been called yet.
        """
        if self.mean is None or self.std is None:
            raise ValueError("Must call fit() before transform()")

        scale = self.std + self.epsilon

        def _norm(t):
            # Shared normalization for both the single- and list-input paths.
            return Tensor((t.data - self.mean) / scale)

        if isinstance(data, list):
            return [_norm(t) for t in data]
        return _norm(data)
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 32
|
||||
# NOTE: nbdev scaffold cell — student TODO stub; shadowed by the working
# implementation exported from the next cell.
def create_data_pipeline(dataset_path: str = "data/cifar10/",
                         batch_size: int = 32,
                         normalize: bool = True,
                         shuffle: bool = True):
    """
    Create a complete data pipeline for training.

    Args:
        dataset_path: Path to dataset
        batch_size: Batch size for training
        normalize: Whether to normalize data
        shuffle: Whether to shuffle data

    Returns:
        Tuple of (train_loader, test_loader)

    TODO: Implement complete data pipeline.

    APPROACH:
    1. Create train and test datasets
    2. Create data loaders
    3. Fit normalizer on training data
    4. Return all components

    EXAMPLE:
    train_loader, test_loader = create_data_pipeline()
    for batch_data, batch_labels in train_loader:
        # Ready for training!
    """
    raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 33
|
||||
def create_data_pipeline(dataset_path: str = "data/cifar10/",
                         batch_size: int = 32,
                         normalize: bool = True,
                         shuffle: bool = True):
    """Create a complete data pipeline for training.

    Builds the CIFAR-10 train/test datasets (downloading if missing), wraps
    them in DataLoaders, and optionally fits normalization statistics on a
    sample of the training data.

    Args:
        dataset_path: Path to dataset
        batch_size: Batch size for training
        normalize: Whether to fit normalization statistics
        shuffle: Whether to shuffle the training data each epoch

    Returns:
        Tuple of (train_loader, test_loader)
    """
    print("🔧 Creating data pipeline...")

    # Create datasets with real CIFAR-10 data
    train_dataset = CIFAR10Dataset(dataset_path, train=True, download=True)
    test_dataset = CIFAR10Dataset(dataset_path, train=False, download=True)

    # Create data loaders; the test set is never shuffled so evaluation
    # order stays deterministic.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    if normalize:
        # NOTE(review): the fitted normalizer is neither applied to the
        # loaders nor returned, so normalization is effectively a no-op for
        # callers; returning it would break the documented 2-tuple interface,
        # so this is only flagged here.
        normalizer = Normalizer()
        # Fit on a subset of training data for efficiency.
        sample_data = [train_dataset[i][0] for i in range(min(1000, len(train_dataset)))]
        # fit() already prints the computed stats, so don't print them again
        # here (the previous version duplicated that output line).
        normalizer.fit(sample_data)

    print("✅ Pipeline created:")
    print(f" - Training batches: {len(train_loader)}")
    print(f" - Test batches: {len(test_loader)}")
    print(f" - Batch size: {batch_size}")
    print(f" - Normalization: {normalize}")

    return train_loader, test_loader
|
||||
Reference in New Issue
Block a user