mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-28 15:52:35 -05:00
- Migrated all Python source files to assignments/source/ structure - Updated nbdev configuration to use assignments/source as nbs_path - Updated all tito commands (nbgrader, export, test) to use new structure - Fixed hardcoded paths in Python files and documentation - Updated config.py to use assignments/source instead of modules - Fixed test command to use correct file naming (short names vs full module names) - Regenerated all notebook files with clean metadata - Verified complete workflow: Python source → NBGrader → nbdev export → testing All systems now working: NBGrader (14 source assignments, 1 released), nbdev export (7 generated files), and pytest integration. The modules/ directory has been retired and replaced with standard NBGrader structure.
695 lines
25 KiB
Python
695 lines
25 KiB
Python
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../assignments/source/06_dataloader/dataloader_dev.ipynb.
|
|
|
|
# %% auto 0
|
|
__all__ = ['Dataset', 'CIFAR10Dataset', 'DataLoader', 'Normalizer', 'create_data_pipeline']
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 3
|
|
import numpy as np
|
|
import sys
|
|
import os
|
|
import pickle
|
|
import struct
|
|
from typing import List, Tuple, Optional, Union, Iterator
|
|
import matplotlib.pyplot as plt
|
|
import urllib.request
|
|
import tarfile
|
|
|
|
# Import our building blocks
|
|
from .tensor import Tensor
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 4
def _should_show_plots():
    """Check if we should show plots (disable during testing)"""
    # Suppress plots when running under pytest or a CLI 'test' argument.
    in_pytest = 'pytest' in sys.modules
    test_on_cli = 'test' in sys.argv
    return not (in_pytest or test_on_cli)
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 6
# NBGrader scaffold: this instructional stub is shadowed by the reference
# implementation of the same class defined later in this module.
class Dataset:
    """
    Base Dataset class: Abstract interface for all datasets.

    The fundamental abstraction for data loading in TinyTorch.
    Students implement concrete datasets by inheriting from this class.

    TODO: Implement the base Dataset class with required methods.

    APPROACH:
    1. Define the interface that all datasets must implement
    2. Include methods for getting individual samples and dataset size
    3. Make it easy to extend for different data types

    EXAMPLE:
        dataset = CIFAR10Dataset("data/cifar10/")
        sample, label = dataset[0]  # Get first sample
        size = len(dataset)         # Get dataset size

    HINTS:
    - Use abstract methods that subclasses must implement
    - Include __getitem__ for indexing and __len__ for size
    - Add helper methods for getting sample shapes and number of classes
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single sample and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (data, label) tensors

        TODO: Implement abstract method for getting samples.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return a tuple of (data, label) tensors
        3. Data should be the input features, label should be the target

        EXAMPLE:
            dataset[0] should return (Tensor(image_data), Tensor(label))
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        TODO: Implement abstract method for getting dataset size.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return the total number of samples in the dataset

        EXAMPLE:
            len(dataset) should return 50000 for CIFAR-10 training set
        """
        raise NotImplementedError("Student implementation required")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """
        Get the shape of a single data sample.

        TODO: Implement method to get sample shape.

        STEP-BY-STEP:
        1. Get the first sample using self[0]
        2. Extract the data part (first element of tuple)
        3. Return the shape of the data tensor

        EXAMPLE:
            For CIFAR-10: returns (3, 32, 32) for RGB images
        """
        raise NotImplementedError("Student implementation required")

    def get_num_classes(self) -> int:
        """
        Get the number of classes in the dataset.

        TODO: Implement abstract method for getting number of classes.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return the total number of classes in the dataset

        EXAMPLE:
            For CIFAR-10: returns 10 (airplane, car, bird, cat, deer, dog, frog, horse, ship, truck)
        """
        raise NotImplementedError("Student implementation required")
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 7
class Dataset:
    """Abstract base for all TinyTorch datasets.

    Concrete datasets subclass this and supply indexing (__getitem__),
    sizing (__len__), and a class count (get_num_classes). The base class
    can derive the per-sample shape by peeking at sample 0.
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return the (data, label) tensor pair stored at ``index``."""
        raise NotImplementedError("Subclasses must implement __getitem__")

    def __len__(self) -> int:
        """Return how many samples the dataset contains."""
        raise NotImplementedError("Subclasses must implement __len__")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """Return the shape of one data sample (inspects sample 0)."""
        first_item = self[0]
        data_part = first_item[0]
        return data_part.shape

    def get_num_classes(self) -> int:
        """Return the number of distinct label classes."""
        raise NotImplementedError("Subclasses must implement get_num_classes")
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 11
# NBGrader scaffold: this instructional stub is shadowed by the reference
# implementation of the same class defined later in this module.
class CIFAR10Dataset(Dataset):
    """
    CIFAR-10 Dataset: Load and manage CIFAR-10 image data.

    CIFAR-10 contains 60,000 32x32 color images in 10 classes.
    Perfect for learning data loading and image processing.

    Args:
        root_dir: Directory containing CIFAR-10 files
        train: If True, load training data. If False, load test data.
        download: If True, download dataset if not present

    TODO: Implement CIFAR-10 dataset loading.

    APPROACH:
    1. Handle dataset download if needed (with progress bar!)
    2. Parse binary files to extract images and labels
    3. Store data efficiently in memory
    4. Implement indexing and size methods

    EXAMPLE:
        dataset = CIFAR10Dataset("data/cifar10/", train=True)
        image, label = dataset[0]  # Get first image
        print(f"Image shape: {image.shape}")  # (3, 32, 32)
        print(f"Label: {label}")  # Tensor with class index

    HINTS:
    - Use pickle to load binary files
    - Each batch file contains 'data' and 'labels' keys
    - Reshape data to (3, 32, 32) format
    - Store images and labels as separate lists
    - Add progress bar with urllib.request.urlretrieve(url, filename, reporthook=progress_function)
    - Progress function receives (block_num, block_size, total_size) parameters
    """

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        """
        Initialize CIFAR-10 dataset.

        Args:
            root_dir: Directory to store/load dataset
            train: If True, load training data. If False, load test data.
            download: If True, download dataset if not present

        TODO: Implement CIFAR-10 initialization.

        STEP-BY-STEP:
        1. Create root directory if it doesn't exist
        2. Download dataset if needed and not present (with progress bar!)
        3. Load binary files and parse data
        4. Store images and labels in memory
        5. Set up class names

        EXAMPLE:
            CIFAR10Dataset("data/cifar10/", train=True)
            creates a dataset with 50,000 training images

        PROGRESS BAR HINT:
            def show_progress(block_num, block_size, total_size):
                downloaded = block_num * block_size
                percent = (downloaded * 100) // total_size
                print(f"\\rDownloading: {percent}%", end='', flush=True)
        """
        raise NotImplementedError("Student implementation required")

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single image and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (image, label) tensors

        TODO: Implement sample retrieval.

        STEP-BY-STEP:
        1. Get image from self.images[index]
        2. Get label from self.labels[index]
        3. Return (Tensor(image), Tensor(label))

        EXAMPLE:
            image, label = dataset[0]
            image.shape should be (3, 32, 32)
            label should be integer 0-9
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        TODO: Return the length of the dataset.

        STEP-BY-STEP:
        1. Return len(self.images)

        EXAMPLE:
            Training set: 50,000 samples
            Test set: 10,000 samples
        """
        raise NotImplementedError("Student implementation required")

    def get_num_classes(self) -> int:
        """
        Get the number of classes in CIFAR-10.

        TODO: Return the number of classes.

        STEP-BY-STEP:
        1. CIFAR-10 has 10 classes
        2. Return 10

        EXAMPLE:
            Returns 10 for CIFAR-10
        """
        raise NotImplementedError("Student implementation required")
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 12
class CIFAR10Dataset(Dataset):
    """CIFAR-10 Dataset: Load and manage CIFAR-10 image data.

    Downloads the python-pickle distribution of CIFAR-10 on demand,
    parses the batch files into memory, and serves (image, label)
    Tensor pairs through the Dataset interface.
    """

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        """Prepare the dataset directory, fetch files if needed, and load them.

        Args:
            root_dir: Directory in which the dataset is stored/downloaded.
            train: If True load the five training batches, else the test batch.
            download: If True, download the archive when it is not present.
        """
        self.root_dir = root_dir
        self.train = train
        # Class names indexed by integer label 0-9 ('car' here corresponds
        # to the upstream 'automobile' class name).
        self.class_names = ['airplane', 'car', 'bird', 'cat', 'deer',
                            'dog', 'frog', 'horse', 'ship', 'truck']

        # Create directory if it doesn't exist
        os.makedirs(root_dir, exist_ok=True)

        # Download if needed
        if download:
            self._download_if_needed()

        # Load data
        self._load_data()

    def _download_if_needed(self):
        """Download and extract CIFAR-10 if not present.

        Best-effort: on any failure this prints manual-download
        instructions instead of raising, so the subsequent _load_data()
        may find no files and silently produce an empty dataset.
        """
        cifar_path = os.path.join(self.root_dir, "cifar-10-batches-py")
        if not os.path.exists(cifar_path):
            print("🔄 Downloading CIFAR-10 dataset...")
            print("📦 Size: ~170MB (this may take a few minutes)")
            url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
            filename = os.path.join(self.root_dir, "cifar-10-python.tar.gz")

            try:
                # Download with progress bar
                def show_progress(block_num, block_size, total_size):
                    """Show download progress bar (urlretrieve reporthook)."""
                    downloaded = block_num * block_size
                    if total_size > 0:
                        # min() clamps the final block, which can overshoot total_size.
                        percent = min(100, (downloaded * 100) // total_size)
                        bar_length = 50
                        filled_length = (percent * bar_length) // 100
                        bar = '█' * filled_length + '░' * (bar_length - filled_length)

                        # Convert bytes to MB
                        downloaded_mb = downloaded / (1024 * 1024)
                        total_mb = total_size / (1024 * 1024)

                        print(f"\r📥 [{bar}] {percent}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)", end='', flush=True)
                    else:
                        # Fallback if total size unknown
                        downloaded_mb = downloaded / (1024 * 1024)
                        print(f"\r📥 Downloaded: {downloaded_mb:.1f} MB", end='', flush=True)

                urllib.request.urlretrieve(url, filename, reporthook=show_progress)
                print()  # New line after progress bar

                # Extract
                print("📂 Extracting CIFAR-10 files...")
                with tarfile.open(filename, 'r:gz') as tar:
                    # NOTE(review): the `filter` argument is only available on
                    # Python 3.12+ (and patched 3.8-3.11 point releases) —
                    # confirm the project's minimum supported Python version.
                    tar.extractall(self.root_dir, filter='data')

                # Clean up the archive once extracted.
                os.remove(filename)
                print("✅ CIFAR-10 downloaded and extracted successfully!")

            except Exception as e:
                print(f"\n❌ Download failed: {e}")
                print("Please download CIFAR-10 manually from https://www.cs.toronto.edu/~kriz/cifar.html")

    def _load_data(self):
        """Load CIFAR-10 images/labels from the extracted pickle batch files."""
        cifar_path = os.path.join(self.root_dir, "cifar-10-batches-py")

        self.images = []
        self.labels = []

        if self.train:
            # Load training batches data_batch_1 .. data_batch_5.
            for i in range(1, 6):
                batch_file = os.path.join(cifar_path, f"data_batch_{i}")
                # Missing files are skipped silently; after a failed download
                # this leaves an empty dataset rather than raising.
                if os.path.exists(batch_file):
                    with open(batch_file, 'rb') as f:
                        batch = pickle.load(f, encoding='bytes')
                        # Convert bytes keys to strings
                        batch = {k.decode('utf-8') if isinstance(k, bytes) else k: v for k, v in batch.items()}

                        # Each row is a flat 3072-byte image; reshape to
                        # (N, 3, 32, 32) float32.
                        images = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
                        labels = batch['labels']

                        self.images.extend(images)
                        self.labels.extend(labels)
        else:
            # Load the single test batch.
            test_file = os.path.join(cifar_path, "test_batch")
            if os.path.exists(test_file):
                with open(test_file, 'rb') as f:
                    batch = pickle.load(f, encoding='bytes')
                    # Convert bytes keys to strings
                    batch = {k.decode('utf-8') if isinstance(k, bytes) else k: v for k, v in batch.items()}

                    # Extract images and labels. NOTE(review): this branch keeps
                    # a numpy array / plain list (train branch builds lists of
                    # arrays) — both support len() and indexing, so __getitem__
                    # and __len__ behave the same either way.
                    self.images = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
                    self.labels = batch['labels']

        print(f"✅ Loaded {len(self.images)} {'training' if self.train else 'test'} samples")

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return the (image, label) Tensor pair at ``index``."""
        image = Tensor(self.images[index])
        label = Tensor(np.array(self.labels[index]))
        return image, label

    def __len__(self) -> int:
        """Get the total number of samples in the dataset."""
        return len(self.images)

    def get_num_classes(self) -> int:
        """Get the number of classes in CIFAR-10 (always 10)."""
        return 10
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 20
# NBGrader scaffold: this instructional stub is shadowed by the reference
# implementation of the same class defined later in this module.
class DataLoader:
    """
    DataLoader: Efficiently batch and iterate through datasets.

    Provides batching, shuffling, and efficient iteration over datasets.
    Essential for training neural networks efficiently.

    Args:
        dataset: Dataset to load from
        batch_size: Number of samples per batch
        shuffle: Whether to shuffle data each epoch

    TODO: Implement DataLoader with batching and shuffling.

    APPROACH:
    1. Store dataset and configuration
    2. Implement __iter__ to yield batches
    3. Handle shuffling and batching logic
    4. Stack individual samples into batches

    EXAMPLE:
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
        for batch_images, batch_labels in dataloader:
            print(f"Batch shape: {batch_images.shape}")  # (32, 3, 32, 32)

    HINTS:
    - Use np.random.permutation for shuffling
    - Stack samples using np.stack
    - Yield batches as (batch_data, batch_labels)
    - Handle last batch that might be smaller
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """
        Initialize DataLoader.

        Args:
            dataset: Dataset to load from
            batch_size: Number of samples per batch
            shuffle: Whether to shuffle data each epoch

        TODO: Store configuration and dataset.

        STEP-BY-STEP:
        1. Store dataset as self.dataset
        2. Store batch_size as self.batch_size
        3. Store shuffle as self.shuffle

        EXAMPLE:
            DataLoader(dataset, batch_size=32, shuffle=True)
        """
        raise NotImplementedError("Student implementation required")

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """
        Iterate through dataset in batches.

        Returns:
            Iterator yielding (batch_data, batch_labels) tuples

        TODO: Implement batching and shuffling logic.

        STEP-BY-STEP:
        1. Create indices list: list(range(len(dataset)))
        2. Shuffle indices if self.shuffle is True
        3. Loop through indices in batch_size chunks
        4. For each batch: collect samples, stack them, yield batch

        EXAMPLE:
            for batch_data, batch_labels in dataloader:
                # batch_data.shape: (batch_size, ...)
                # batch_labels.shape: (batch_size,)
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the number of batches per epoch.

        TODO: Calculate number of batches.

        STEP-BY-STEP:
        1. Get dataset size: len(self.dataset)
        2. Calculate: (dataset_size + batch_size - 1) // batch_size
        3. This handles the last partial batch correctly

        EXAMPLE:
            Dataset size: 100, batch_size: 32
            Number of batches: 4 (32, 32, 32, 4)
        """
        raise NotImplementedError("Student implementation required")
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 21
class DataLoader:
    """Batches samples from a Dataset, optionally reshuffling every epoch."""

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        # Configuration only; no work happens until iteration starts.
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """Yield (batch_data, batch_labels) Tensor pairs covering the dataset.

        The final batch may hold fewer than batch_size samples.
        """
        # Fresh index order each epoch; shuffled in place when requested.
        order = list(range(len(self.dataset)))
        if self.shuffle:
            np.random.shuffle(order)

        start = 0
        while start < len(order):
            chunk = order[start:start + self.batch_size]
            start += self.batch_size

            # Pull the samples for this chunk and stack the raw arrays
            # along a new leading batch axis.
            samples = [self.dataset[idx] for idx in chunk]
            stacked_data = np.stack([sample.data for sample, _ in samples], axis=0)
            stacked_labels = np.stack([target.data for _, target in samples], axis=0)

            yield Tensor(stacked_data), Tensor(stacked_labels)

    def __len__(self) -> int:
        """Number of batches per epoch (a trailing partial batch counts as one)."""
        return (len(self.dataset) + self.batch_size - 1) // self.batch_size
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 26
# NBGrader scaffold: this instructional stub is shadowed by the reference
# implementation of the same class defined later in this module.
class Normalizer:
    """
    Data Normalizer: Standardize data for better training.

    Computes mean and standard deviation from training data,
    then applies normalization to new data.

    TODO: Implement data normalization.

    APPROACH:
    1. Fit: Compute mean and std from training data
    2. Transform: Apply normalization using computed stats
    3. Handle both single tensors and batches

    EXAMPLE:
        normalizer = Normalizer()
        normalizer.fit(training_data)  # Compute stats
        normalized = normalizer.transform(new_data)  # Apply normalization

    HINTS:
    - Store mean and std as instance variables
    - Use np.mean and np.std for statistics
    - Apply: (data - mean) / std
    - Handle division by zero (add small epsilon)
    """

    def __init__(self):
        """
        Initialize normalizer.

        TODO: Initialize mean and std to None.

        STEP-BY-STEP:
        1. Set self.mean = None
        2. Set self.std = None
        3. Set self.epsilon = 1e-8 (for numerical stability)

        EXAMPLE:
            normalizer = Normalizer()
        """
        raise NotImplementedError("Student implementation required")

    def fit(self, data: List[Tensor]):
        """
        Compute normalization statistics from training data.

        Args:
            data: List of tensors to compute statistics from

        TODO: Compute mean and standard deviation.

        STEP-BY-STEP:
        1. Stack all tensors: np.stack([t.data for t in data])
        2. Compute mean: np.mean(stacked_data)
        3. Compute std: np.std(stacked_data)
        4. Store as self.mean and self.std

        EXAMPLE:
            normalizer.fit([tensor1, tensor2, tensor3])
        """
        raise NotImplementedError("Student implementation required")

    def transform(self, data: Union[Tensor, List[Tensor]]) -> Union[Tensor, List[Tensor]]:
        """
        Apply normalization to data.

        Args:
            data: Tensor or list of tensors to normalize

        Returns:
            Normalized tensor(s)

        TODO: Apply normalization using computed statistics.

        STEP-BY-STEP:
        1. Check if mean and std are computed (not None)
        2. If single tensor: apply (data - mean) / (std + epsilon)
        3. If list: apply to each tensor in the list
        4. Return normalized data

        EXAMPLE:
            normalized = normalizer.transform(tensor)
        """
        raise NotImplementedError("Student implementation required")
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 27
class Normalizer:
    """Standardizes tensors using mean/std statistics learned via fit()."""

    def __init__(self):
        # Statistics stay unset until fit() runs.
        self.mean = None
        self.std = None
        self.epsilon = 1e-8  # guards the division when std is ~0

    def fit(self, data: List[Tensor]):
        """Compute and store a global mean and std over a list of tensors."""
        stacked = np.stack([tensor.data for tensor in data])
        self.mean = np.mean(stacked)
        self.std = np.std(stacked)

        print(f"✅ Computed normalization stats: mean={self.mean:.4f}, std={self.std:.4f}")

    def transform(self, data: Union[Tensor, List[Tensor]]) -> Union[Tensor, List[Tensor]]:
        """Apply (x - mean) / (std + epsilon) to a tensor or list of tensors.

        Raises:
            ValueError: if fit() has not been called yet.
        """
        if self.mean is None or self.std is None:
            raise ValueError("Must call fit() before transform()")

        def _standardize(t):
            # One tensor at a time, using the fitted statistics.
            return Tensor((t.data - self.mean) / (self.std + self.epsilon))

        if isinstance(data, list):
            return [_standardize(t) for t in data]
        return _standardize(data)
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 31
# NBGrader scaffold: this instructional stub is shadowed by the reference
# implementation of the same function defined later in this module.
def create_data_pipeline(dataset_path: str = "data/cifar10/",
                         batch_size: int = 32,
                         normalize: bool = True,
                         shuffle: bool = True):
    """
    Create a complete data pipeline for training.

    Args:
        dataset_path: Path to dataset
        batch_size: Batch size for training
        normalize: Whether to normalize data
        shuffle: Whether to shuffle data

    Returns:
        Tuple of (train_loader, test_loader)

    TODO: Implement complete data pipeline.

    APPROACH:
    1. Create train and test datasets
    2. Create data loaders
    3. Fit normalizer on training data
    4. Return all components

    EXAMPLE:
        train_loader, test_loader = create_data_pipeline()
        for batch_data, batch_labels in train_loader:
            # Ready for training!
    """
    raise NotImplementedError("Student implementation required")
|
|
|
|
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 32
def create_data_pipeline(dataset_path: str = "data/cifar10/",
                         batch_size: int = 32,
                         normalize: bool = True,
                         shuffle: bool = True):
    """Create a complete data pipeline for training.

    Builds the CIFAR-10 train/test datasets (downloading on first use)
    and wraps them in DataLoaders.

    Args:
        dataset_path: Directory for the CIFAR-10 files.
        batch_size: Samples per batch for both loaders.
        normalize: If True, fit a Normalizer on up to 1000 training samples.
        shuffle: Whether the training loader reshuffles each epoch.

    Returns:
        Tuple of (train_loader, test_loader).

    NOTE(review): the fitted normalizer is a local variable — it is neither
    returned nor applied to the loaders, so batches come out un-normalized.
    Confirm whether callers are expected to receive and apply it.
    """

    print("🔧 Creating data pipeline...")

    # Create datasets with real CIFAR-10 data
    train_dataset = CIFAR10Dataset(dataset_path, train=True, download=True)
    test_dataset = CIFAR10Dataset(dataset_path, train=False, download=True)

    # Create data loaders; the test loader is never shuffled so that
    # evaluation order stays reproducible.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create normalizer
    normalizer = None
    if normalize:
        normalizer = Normalizer()
        # Fit on a subset of training data for efficiency
        sample_data = [train_dataset[i][0] for i in range(min(1000, len(train_dataset)))]
        normalizer.fit(sample_data)
        # NOTE(review): Normalizer.fit() already prints these stats, so this
        # line emits the same message a second time.
        print(f"✅ Computed normalization stats: mean={normalizer.mean:.4f}, std={normalizer.std:.4f}")

    print(f"✅ Pipeline created:")
    print(f" - Training batches: {len(train_loader)}")
    print(f" - Test batches: {len(test_loader)}")
    print(f" - Batch size: {batch_size}")
    print(f" - Normalization: {normalize}")

    return train_loader, test_loader
|