# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/07_dataloader/dataloader_dev.ipynb.

# %% auto 0
__all__ = ['Dataset', 'DataLoader', 'SimpleDataset', 'download_cifar10', 'CIFAR10Dataset']

# %% ../../modules/source/07_dataloader/dataloader_dev.ipynb 1
import numpy as np
import sys
import os
from typing import Tuple, Optional, Iterator
import urllib.request
import tarfile
import pickle
import time

# Import our building blocks - try package first, then local modules
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    from tensor_dev import Tensor


# %% ../../modules/source/07_dataloader/dataloader_dev.ipynb 7
class Dataset:
    """
    Base Dataset class: abstract interface for all datasets.

    Subclasses override ``__getitem__``, ``__len__`` and ``get_num_classes``.
    ``DataLoader`` only relies on ``__len__`` and ``__getitem__`` to build
    batches.
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single sample and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (data, label) tensors, where data holds the input
            features and label holds the target.

        Raises:
            NotImplementedError: Always; subclasses must override this method.

        EXAMPLE:
            dataset[0] should return (Tensor(image_data), Tensor(label))
        """
        ### BEGIN SOLUTION
        # This is an abstract method - subclasses must implement it
        raise NotImplementedError("Subclasses must implement __getitem__")
        ### END SOLUTION

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        Returns:
            Number of samples. DataLoader uses this value to build its index
            list and to compute the number of batches.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        ### BEGIN SOLUTION
        # This is an abstract method - subclasses must implement it
        raise NotImplementedError("Subclasses must implement __len__")
        ### END SOLUTION

    def get_sample_shape(self) -> Tuple[int, ...]:
        """
        Get the shape of a single data sample.

        The shape is read from the first sample (``self[0]``), so the dataset
        must contain at least one sample and ``__getitem__`` must be
        implemented.

        Returns:
            ``shape`` of the data tensor of sample 0.

        EXAMPLE:
            For CIFAR10Dataset: returns (3, 32, 32)
        """
        ### BEGIN SOLUTION
        # Only the data half of the (data, label) pair is needed.
        data, _ = self[0]
        return data.shape
        ### END SOLUTION

    def get_num_classes(self) -> int:
        """
        Get the number of classes in the dataset.

        Returns:
            Number of unique classes/categories.

        Raises:
            NotImplementedError: Always; subclasses must override this method.

        EXAMPLE:
            For CIFAR10Dataset: returns 10
        """
        # This is an abstract method - subclasses must implement it
        raise NotImplementedError("Subclasses must implement get_num_classes")


# %% ../../modules/source/07_dataloader/dataloader_dev.ipynb 11
class DataLoader:
    """
    DataLoader: batch and iterate through datasets.

    Wraps a Dataset and yields (batch_data, batch_labels) tensor pairs,
    optionally visiting the samples in a freshly shuffled order on every
    iteration.
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """
        Initialize DataLoader.

        Args:
            dataset: Dataset to load from
            batch_size: Number of samples per batch
            shuffle: Whether to shuffle data each epoch

        Raises:
            TypeError: If ``dataset`` is None.
            ValueError: If ``batch_size`` is not a positive ``int``.

        EXAMPLE:
            DataLoader(dataset, batch_size=32, shuffle=True)
        """
        # Input validation
        if dataset is None:
            raise TypeError("Dataset cannot be None")
        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError(f"Batch size must be a positive integer, got {batch_size}")

        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """
        Iterate through dataset in batches.

        Each call builds a new index order, so every epoch is reshuffled when
        ``self.shuffle`` is True. Shuffling draws from NumPy's global random
        state. The final batch is smaller than ``batch_size`` when the dataset
        size is not a multiple of it.

        Returns:
            Iterator yielding (batch_data, batch_labels) tuples; the samples
            of a batch are stacked along a new leading axis.

        EXAMPLE:
            for batch_data, batch_labels in dataloader:
                # batch_data.shape: (batch_size, ...)
                ...
        """
        # Create indices for all samples
        indices = list(range(len(self.dataset)))

        # Shuffle if requested
        if self.shuffle:
            np.random.shuffle(indices)

        # Iterate through indices in batches
        for start in range(0, len(indices), self.batch_size):
            batch_indices = indices[start:start + self.batch_size]

            # Collect the raw arrays (Tensor.data) for this batch
            batch_data = []
            batch_labels = []
            for idx in batch_indices:
                data, label = self.dataset[idx]
                batch_data.append(data.data)
                batch_labels.append(label.data)

            # Stack into batch tensors
            batch_data_array = np.stack(batch_data, axis=0)
            batch_labels_array = np.stack(batch_labels, axis=0)

            yield Tensor(batch_data_array), Tensor(batch_labels_array)

    def __len__(self) -> int:
        """
        Get the number of batches per epoch.

        Returns:
            ``ceil(len(dataset) / batch_size)``, counting a trailing partial
            batch as one batch.

        EXAMPLE:
            Dataset size 100, batch size 32 -> 4 batches
        """
        # Calculate number of batches using ceiling division
        dataset_size = len(self.dataset)
        return (dataset_size + self.batch_size - 1) // self.batch_size


# %% ../../modules/source/07_dataloader/dataloader_dev.ipynb 15
class SimpleDataset(Dataset):
    """
    Simple dataset for testing and demonstration.

    Holds synthetic float32 features and integer labels generated from a
    fixed seed, so two instances built with the same arguments contain the
    same values.
    """

    def __init__(self, size: int = 100, num_features: int = 4, num_classes: int = 3):
        """
        Initialize SimpleDataset.

        Args:
            size: Number of samples in the dataset
            num_features: Number of features per sample
            num_classes: Number of classes

        EXAMPLE:
            SimpleDataset(size=100, num_features=4, num_classes=3)
            creates 100 samples with 4 features each, 3 classes
        """
        self.size = size
        self.num_features = num_features
        self.num_classes = num_classes

        # Generate synthetic data (deterministic for testing).
        # A private RandomState seeded with 42 produces the same stream that
        # np.random.seed(42) followed by the module-level functions would,
        # without reseeding the global RNG that DataLoader shuffling draws
        # from (reseeding it here would make later shuffles repeat).
        rng = np.random.RandomState(42)
        self.data = rng.randn(size, num_features).astype(np.float32)
        self.labels = rng.randint(0, num_classes, size=size)

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a sample by index.

        Args:
            index: Index of the sample

        Returns:
            Tuple of (data, label) tensors built from ``self.data[index]`` and
            ``self.labels[index]``.

        EXAMPLE:
            dataset[0] returns (Tensor(features), Tensor(label))
        """
        data = self.data[index]
        label = self.labels[index]
        return Tensor(data), Tensor(label)

    def __len__(self) -> int:
        """
        Get the dataset size.

        Returns:
            The ``size`` passed to the constructor.
        """
        return self.size

    def get_num_classes(self) -> int:
        """
        Get the number of classes.

        Returns:
            The ``num_classes`` passed to the constructor.
        """
        return self.num_classes


# %% ../../modules/source/07_dataloader/dataloader_dev.ipynb 17
def _safe_extract(tar: tarfile.TarFile, dest: str) -> None:
    """
    Extract ``tar`` into ``dest`` while refusing entries that escape ``dest``.

    Uses the standard library's ``data`` extraction filter when this Python
    build provides it. Otherwise every member path is resolved and checked to
    stay inside ``dest`` before anything is written; this fallback vets member
    names only, not link targets.

    Raises:
        tarfile.TarError: If a member would be written outside ``dest``.
    """
    if hasattr(tarfile, "data_filter"):
        tar.extractall(dest, filter="data")
        return

    dest_root = os.path.realpath(dest)
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(dest_root, member.name))
        try:
            inside = os.path.commonpath([dest_root, target]) == dest_root
        except ValueError:
            # commonpath raises when the paths cannot be compared
            # (e.g. different drives on Windows) - treat as an escape.
            inside = False
        if not inside:
            raise tarfile.TarError(f"Unsafe path in archive: {member.name}")
    tar.extractall(dest)


def download_cifar10(root: str = "./data") -> str:
    """
    Download CIFAR-10 dataset.

    Creates ``root`` if needed. When ``<root>/cifar-10-batches-py`` already
    exists nothing is downloaded; otherwise the archive is fetched to
    ``<root>/cifar-10.tar.gz`` and extracted into ``root``.

    Args:
        root: Directory that holds (or will hold) the dataset.

    Returns:
        Path ``<root>/cifar-10-batches-py``.

    Raises:
        tarfile.TarError: If the archive contains an entry that would be
            extracted outside ``root``.
    """
    ### BEGIN SOLUTION
    os.makedirs(root, exist_ok=True)
    dataset_dir = os.path.join(root, "cifar-10-batches-py")

    if os.path.exists(dataset_dir):
        print(f"✅ CIFAR-10 found at {dataset_dir}")
        return dataset_dir

    url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    tar_path = os.path.join(root, "cifar-10.tar.gz")

    print("📥 Downloading CIFAR-10 (~170MB)...")
    urllib.request.urlretrieve(url, tar_path)
    print("✅ Downloaded!")

    print("📦 Extracting...")
    with tarfile.open(tar_path, 'r:gz') as tar:
        # The archive comes from the network: never extract it unchecked.
        _safe_extract(tar, root)
    print("✅ Ready!")

    return dataset_dir
    ### END SOLUTION


class CIFAR10Dataset(Dataset):
    """CIFAR-10 dataset for CNN training."""

    def __init__(self, root: str = "./data", train: bool = True, download: bool = False):
        """
        Load CIFAR-10 data.

        Args:
            root: Directory containing (or receiving) ``cifar-10-batches-py``.
            train: If True load ``data_batch_1`` .. ``data_batch_5``,
                otherwise load ``test_batch``.
            download: If True call ``download_cifar10(root)`` first.

        Raises:
            FileNotFoundError: If the dataset directory or a batch file is
                missing.

        After loading, ``self.data`` is a float32 array reshaped to
        (N, 3, 32, 32) and divided by 255.0, and ``self.labels`` is an array
        of the batch labels.
        """
        ### BEGIN SOLUTION
        if download:
            dataset_dir = download_cifar10(root)
        else:
            dataset_dir = os.path.join(root, "cifar-10-batches-py")

        if not os.path.isdir(dataset_dir):
            # Fail early with a hint instead of an error about one batch file.
            raise FileNotFoundError(
                f"CIFAR-10 not found at {dataset_dir}; pass download=True to fetch it"
            )

        if train:
            batch_names = [f"data_batch_{i}" for i in range(1, 6)]
        else:
            batch_names = ["test_batch"]

        data_list = []
        label_list = []
        for batch_name in batch_names:
            with open(os.path.join(dataset_dir, batch_name), 'rb') as f:
                # NOTE(security): pickle can execute arbitrary code while
                # loading; only point `root` at archives from a trusted source.
                batch = pickle.load(f, encoding='bytes')
            data_list.append(batch[b'data'])
            label_list.extend(batch[b'labels'])

        self.labels = np.array(label_list)

        # Reshape to (N, 3, 32, 32) and normalize
        self.data = np.concatenate(data_list).reshape(-1, 3, 32, 32).astype(np.float32) / 255.0
        print(f"✅ Loaded {len(self.data):,} images")
        ### END SOLUTION

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        """Return (image, label) tensors for the sample at ``idx``."""
        return Tensor(self.data[idx]), Tensor(self.labels[idx])

    def __len__(self) -> int:
        """Return the number of loaded images."""
        return len(self.data)

    def get_num_classes(self) -> int:
        """Return the number of CIFAR-10 classes (10)."""
        return 10