# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/06_dataloader/dataloader_dev.ipynb.

# %% auto 0
__all__ = ['Dataset', 'DataLoader', 'SimpleDataset']

# %% ../../modules/source/06_dataloader/dataloader_dev.ipynb 1
import numpy as np
import sys
import os
import pickle
import struct
from typing import List, Tuple, Optional, Union, Iterator
import matplotlib.pyplot as plt
import urllib.request
import tarfile

# Import our building blocks - try package first, then local modules
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    from tensor_dev import Tensor


# %% ../../modules/source/06_dataloader/dataloader_dev.ipynb 2
def _should_show_plots():
    """Return True when plots should be shown, False when a test run is detected.

    A test run is assumed when any of the following holds:
      - the ``pytest`` module has already been imported,
      - the ``PYTEST_CURRENT_TEST`` environment variable is set,
      - any command-line argument contains the substring ``'test'``.
    """
    # The substring check on argv also covers an argument equal to 'test'
    # and any argument containing 'pytest', so no separate checks are needed.
    in_test_mode = (
        'pytest' in sys.modules
        or os.environ.get('PYTEST_CURRENT_TEST') is not None
        or any('test' in arg for arg in sys.argv)
    )

    # Show plots in development mode (when not in test mode)
    return not in_test_mode


# %% ../../modules/source/06_dataloader/dataloader_dev.ipynb 7
class Dataset:
    """
    Base Dataset class: abstract interface for all datasets.

    The fundamental abstraction for data loading in TinyTorch. Concrete
    datasets inherit from this class and override ``__getitem__``,
    ``__len__`` and ``get_num_classes``; each of those raises
    ``NotImplementedError`` here.
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single sample and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (data, label) tensors, where data holds the input
            features and label holds the target.

        Raises:
            NotImplementedError: Always; subclasses must override this method.

        EXAMPLE:
            dataset[0] should return (Tensor(image_data), Tensor(label))
        """
        ### BEGIN SOLUTION
        # This is an abstract method - subclasses must implement it
        raise NotImplementedError("Subclasses must implement __getitem__")
        ### END SOLUTION

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        Returns:
            Integer number of samples.

        Raises:
            NotImplementedError: Always; subclasses must override this method.

        EXAMPLE:
            len(dataset) should return 50000 for CIFAR-10 training set
        """
        ### BEGIN SOLUTION
        # This is an abstract method - subclasses must implement it
        raise NotImplementedError("Subclasses must implement __len__")
        ### END SOLUTION

    def get_sample_shape(self) -> Tuple[int, ...]:
        """
        Get the shape of a single data sample.

        Fetches the first sample via ``self[0]`` and returns the ``shape``
        attribute of its data part, so it relies on the subclass's
        ``__getitem__`` and on the dataset holding at least one sample.

        Returns:
            The ``shape`` of the first sample's data tensor.

        EXAMPLE:
            For CIFAR-10: returns (3, 32, 32) for RGB images
        """
        ### BEGIN SOLUTION
        # Only the data half of the (data, label) pair is needed
        first_data, _ = self[0]
        return first_data.shape
        ### END SOLUTION

    def get_num_classes(self) -> int:
        """
        Get the number of classes in the dataset.

        Returns:
            Number of unique classes/categories.

        Raises:
            NotImplementedError: Always; subclasses must override this method.

        EXAMPLE:
            For CIFAR-10: returns 10 (classes 0-9)
        """
        ### BEGIN SOLUTION
        # This is an abstract method - subclasses must implement it
        raise NotImplementedError("Subclasses must implement get_num_classes")
        ### END SOLUTION


# %% ../../modules/source/06_dataloader/dataloader_dev.ipynb 11
class DataLoader:
    """
    DataLoader: batch and iterate through datasets.

    Wraps a Dataset and yields (batch_data, batch_labels) tensor pairs,
    optionally visiting the samples in a freshly shuffled order on every
    iteration.
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """
        Initialize DataLoader.

        Args:
            dataset: Dataset to load from
            batch_size: Number of samples per batch (must be positive)
            shuffle: Whether to shuffle data each epoch

        Raises:
            ValueError: If batch_size is not positive.

        EXAMPLE:
            DataLoader(dataset, batch_size=32, shuffle=True)
        """
        ### BEGIN SOLUTION
        # Fail fast: a zero batch size would otherwise surface later as a
        # range() error in __iter__ or a ZeroDivisionError in __len__, and a
        # negative one would silently produce no batches at all.
        if batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )

        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        ### END SOLUTION

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """
        Iterate through dataset in batches.

        Each call builds a new index order (shuffled with the global NumPy
        RNG when ``self.shuffle`` is true) and walks it in chunks of
        ``self.batch_size``. The final batch may be smaller than
        ``batch_size``. An empty dataset yields nothing.

        Returns:
            Iterator yielding (batch_data, batch_labels) tuples, each built
            by stacking the per-sample ``.data`` arrays along a new axis 0.

        EXAMPLE:
            for batch_data, batch_labels in dataloader:
                # batch_data.shape: (batch_size, ...)
                # batch_labels.shape: (batch_size,)
        """
        ### BEGIN SOLUTION
        # Create indices for all samples
        indices = list(range(len(self.dataset)))

        # Shuffle in place if requested
        if self.shuffle:
            np.random.shuffle(indices)

        # Walk the index list one batch-sized slice at a time
        for start in range(0, len(indices), self.batch_size):
            batch_indices = indices[start:start + self.batch_size]

            # Fetch each sample once, then split into data and label arrays
            samples = [self.dataset[idx] for idx in batch_indices]
            batch_data_array = np.stack(
                [data.data for data, _ in samples], axis=0
            )
            batch_labels_array = np.stack(
                [label.data for _, label in samples], axis=0
            )

            yield Tensor(batch_data_array), Tensor(batch_labels_array)
        ### END SOLUTION

    def __len__(self) -> int:
        """
        Get the number of batches per epoch.

        Uses ceiling division so a trailing partial batch is counted.

        Returns:
            Number of batches one full iteration produces.

        EXAMPLE:
            Dataset size 100, batch size 32 → 4 batches
        """
        ### BEGIN SOLUTION
        # Ceiling division: (n + batch_size - 1) // batch_size
        dataset_size = len(self.dataset)
        return (dataset_size + self.batch_size - 1) // self.batch_size
        ### END SOLUTION


# %% ../../modules/source/06_dataloader/dataloader_dev.ipynb 15
class SimpleDataset(Dataset):
    """
    Simple dataset for testing and demonstration.

    Generates synthetic data with configurable size and properties.
    The generated values are reproducible: every instance built with the
    same arguments holds the same data and labels.
    """

    def __init__(self, size: int = 100, num_features: int = 4, num_classes: int = 3):
        """
        Initialize SimpleDataset.

        Args:
            size: Number of samples in the dataset
            num_features: Number of features per sample
            num_classes: Number of classes

        EXAMPLE:
            SimpleDataset(size=100, num_features=4, num_classes=3)
            creates 100 samples with 4 features each, 3 classes
        """
        ### BEGIN SOLUTION
        self.size = size
        self.num_features = num_features
        self.num_classes = num_classes

        # Use a private, fixed-seed generator so the data is reproducible
        # without reseeding the process-wide NumPy RNG (which would also make
        # every later np.random call, e.g. DataLoader shuffling, deterministic).
        rng = np.random.RandomState(42)

        # Features: float32 array of shape (size, num_features)
        self.data = rng.randn(size, num_features).astype(np.float32)
        # Labels: integers drawn from [0, num_classes), one per sample
        self.labels = rng.randint(0, num_classes, size=size)
        ### END SOLUTION

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single sample and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (data, label) tensors wrapping ``self.data[index]`` and
            ``self.labels[index]``.
        """
        ### BEGIN SOLUTION
        data = Tensor(self.data[index])
        label = Tensor(self.labels[index])
        return data, label
        ### END SOLUTION

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        Returns:
            The ``size`` passed at construction.
        """
        ### BEGIN SOLUTION
        return self.size
        ### END SOLUTION

    def get_num_classes(self) -> int:
        """
        Get the number of classes in the dataset.

        Returns:
            The ``num_classes`` passed at construction.
        """
        ### BEGIN SOLUTION
        return self.num_classes
        ### END SOLUTION