# TinyTorch/tinytorch/core/dataloader.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/08_dataloader/dataloader_dev.ipynb.

# %% auto 0
__all__ = ['Dataset', 'DataLoader', 'SimpleDataset', 'download_cifar10', 'CIFAR10Dataset']

# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 1
import numpy as np
import sys
import os
from typing import Tuple, Optional, Iterator
import urllib.request
import tarfile
import pickle
import time

# Import our building blocks - try package first, then local modules
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    from tensor_dev import Tensor

# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 7
class Dataset:
    """
    Abstract interface shared by every TinyTorch dataset.

    A concrete dataset subclasses this class and overrides ``__getitem__``,
    ``__len__`` and ``get_num_classes``; each of those raises
    ``NotImplementedError`` here.  ``get_sample_shape`` is the only method
    with a real implementation and is built on top of ``__getitem__``.
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Return the sample stored at ``index`` as a ``(data, label)`` pair.

        Args:
            index: Position of the requested sample.

        Returns:
            Tuple of (data, label) tensors, where data holds the input
            features and label holds the target.

        Raises:
            NotImplementedError: Always, in this base class.

        EXAMPLE:
        dataset[0] should return (Tensor(image_data), Tensor(label))

        LEARNING CONNECTIONS:
        - **PyTorch Integration**: Same indexing protocol as torch.utils.data.Dataset
        - **Batching Foundation**: DataLoader calls __getitem__ once per sample
          when it assembles a batch
        """
        ### BEGIN SOLUTION
        # Abstract hook: concrete datasets provide the actual lookup
        raise NotImplementedError("Subclasses must implement __getitem__")
        ### END SOLUTION

    def __len__(self) -> int:
        """
        Return how many samples the dataset contains.

        Returns:
            Total number of samples as an integer.

        Raises:
            NotImplementedError: Always, in this base class.

        EXAMPLE:
        len(dataset) should return 50000 for CIFAR-10 training set

        LEARNING CONNECTIONS:
        - **Batch Planning**: DataLoader uses len() to build its index list
          and to compute the number of batches per epoch
        """
        ### BEGIN SOLUTION
        # Abstract hook: concrete datasets report their own size
        raise NotImplementedError("Subclasses must implement __len__")
        ### END SOLUTION

    def get_sample_shape(self) -> Tuple[int, ...]:
        """
        Report the shape of one data sample, measured on sample 0.

        Only the first sample is inspected; the method does not check that
        the remaining samples share that shape.

        Returns:
            The ``shape`` attribute of the data half of ``self[0]``.

        EXAMPLE:
        For CIFAR-10: returns (3, 32, 32) for RGB images
        """
        ### BEGIN SOLUTION
        # self[0] must unpack into exactly (data, label); the label is unused
        first_features, _ = self[0]
        sample_shape = first_features.shape
        return sample_shape
        ### END SOLUTION

    def get_num_classes(self) -> int:
        """
        Return the number of distinct classes in the dataset.

        Returns:
            Number of classes/categories as an integer.

        Raises:
            NotImplementedError: Always, in this base class.

        EXAMPLE:
        For CIFAR-10: returns 10 (classes 0-9)
        """
        # Abstract hook: concrete datasets know their own label space
        raise NotImplementedError("Subclasses must implement get_num_classes")

# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 11
class DataLoader:
    """
    Batching iterator over a Dataset.

    Each pass over the loader walks the dataset's indices (optionally in a
    freshly shuffled order), groups them into chunks of ``batch_size`` and
    yields one stacked ``(batch_data, batch_labels)`` pair per chunk.
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """
        Validate and store the loader configuration.

        Args:
            dataset: Dataset to load from
            batch_size: Number of samples per batch
            shuffle: Whether to shuffle data each epoch

        Raises:
            TypeError: If ``dataset`` is None.
            ValueError: If ``batch_size`` is not an ``int`` greater than zero.

        EXAMPLE:
        DataLoader(dataset, batch_size=32, shuffle=True)
        """
        # Guard clauses: reject unusable arguments before storing anything
        if dataset is None:
            raise TypeError("Dataset cannot be None")

        is_positive_int = isinstance(batch_size, int) and batch_size > 0
        if not is_positive_int:
            raise ValueError(f"Batch size must be a positive integer, got {batch_size}")

        self.dataset, self.batch_size, self.shuffle = dataset, batch_size, shuffle

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """
        Yield the dataset one batch at a time.

        A new index order is built on every call, so each epoch is shuffled
        independently when ``self.shuffle`` is set.  The final batch is
        smaller than ``batch_size`` when the dataset size is not a multiple
        of it.

        Returns:
            Iterator yielding (batch_data, batch_labels) tuples; each element
            is a Tensor wrapping the samples stacked along a new axis 0.

        EXAMPLE:
        for batch_data, batch_labels in dataloader:
            # batch_data.shape: (batch_size, ...)
            # batch_labels.shape: (batch_size,)
        """
        # Visit order for this epoch
        order = list(range(len(self.dataset)))
        if self.shuffle:
            # In-place shuffle driven by NumPy's global random state
            np.random.shuffle(order)

        for start in range(0, len(order), self.batch_size):
            chunk = order[start:start + self.batch_size]

            # Fetch each sample and unwrap the payload via the `.data`
            # attribute of the returned tensors, one sample at a time
            unwrapped = [
                (sample.data, target.data)
                for sample, target in (self.dataset[position] for position in chunk)
            ]

            # Stack the per-sample payloads along a new leading batch axis
            stacked_samples = np.stack([pair[0] for pair in unwrapped], axis=0)
            stacked_targets = np.stack([pair[1] for pair in unwrapped], axis=0)

            yield Tensor(stacked_samples), Tensor(stacked_targets)

    def __len__(self) -> int:
        """
        Return the number of batches produced per epoch.

        This is ``len(dataset) / batch_size`` rounded up, so a trailing
        partial batch is counted.

        EXAMPLE:
        Dataset size 100, batch size 32 → 4 batches
        """
        # Ceiling division: one extra batch whenever there is a remainder
        full_batches, leftover = divmod(len(self.dataset), self.batch_size)
        if leftover:
            return full_batches + 1
        return full_batches

# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 15
class SimpleDataset(Dataset):
    """
    Simple dataset for testing and demonstration.

    Holds synthetic, reproducible data: a ``(size, num_features)`` float32
    feature matrix and ``size`` integer labels drawn from
    ``[0, num_classes)``.  Perfect for understanding the Dataset pattern.
    """

    def __init__(self, size: int = 100, num_features: int = 4, num_classes: int = 3):
        """
        Generate the synthetic samples and labels.

        The data comes from a private generator seeded with 42, so two
        datasets built with the same arguments are identical.  NumPy's
        global random state is left untouched: constructing a dataset no
        longer resets the seed used elsewhere (for example by DataLoader
        shuffling).

        Args:
            size: Number of samples in the dataset
            num_features: Number of features per sample
            num_classes: Number of classes

        EXAMPLE:
        SimpleDataset(size=100, num_features=4, num_classes=3)
        creates 100 samples with 4 features each, 3 classes
        """
        self.size = size
        self.num_features = num_features
        self.num_classes = num_classes

        # A dedicated legacy RandomState(42) produces the same stream as
        # np.random.seed(42) followed by the module-level functions, but
        # without the global side effect of reseeding everyone's RNG.
        rng = np.random.RandomState(42)
        self.data = rng.randn(size, num_features).astype(np.float32)
        self.labels = rng.randint(0, num_classes, size=size)

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Return the sample at ``index`` wrapped in Tensors.

        Args:
            index: Index of the sample

        Returns:
            Tuple of (data, label) tensors built from ``self.data[index]``
            and ``self.labels[index]``.

        EXAMPLE:
        dataset[0] returns (Tensor(features), Tensor(label))
        """
        data = self.data[index]
        label = self.labels[index]
        return Tensor(data), Tensor(label)

    def __len__(self) -> int:
        """
        Return the number of samples (the ``size`` given at construction).

        EXAMPLE:
        len(dataset) returns 100 for dataset with 100 samples
        """
        return self.size

    def get_num_classes(self) -> int:
        """
        Return the number of classes given at construction.

        EXAMPLE:
        dataset.get_num_classes() returns 3 for 3-class dataset
        """
        return self.num_classes

# %% ../../modules/source/08_dataloader/dataloader_dev.ipynb 17
# Canonical location of the Python-pickled CIFAR-10 archive
_CIFAR10_URL = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"


def _extract_tar_safely(tar: tarfile.TarFile, dest: str) -> None:
    """Extract all members of *tar* into *dest*, refusing paths that escape it."""
    if hasattr(tarfile, "data_filter"):
        # Interpreters with extraction filters: the 'data' filter rejects
        # absolute paths, '..' traversal and links pointing outside dest.
        tar.extractall(dest, filter="data")
        return

    # Older interpreters: check member names by hand before extracting.
    # NOTE: this fallback validates names only, not link targets.
    base = os.path.realpath(dest)
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(base, member.name))
        if os.path.commonpath([base, target]) != base:
            raise tarfile.TarError(
                f"Refusing to extract {member.name!r} outside {dest!r}"
            )
    tar.extractall(dest)


def download_cifar10(root: str = "./data", url: str = _CIFAR10_URL) -> str:
    """
    Download and extract the CIFAR-10 dataset, reusing an existing copy.

    If ``<root>/cifar-10-batches-py`` already exists, nothing is downloaded.
    Otherwise the archive is fetched to ``<root>/cifar-10.tar.gz`` and
    unpacked into ``root``.  The archive file is left in place afterwards.

    Args:
        root: Directory that holds (or will hold) the dataset; created if
            missing.
        url: Location of the ``.tar.gz`` archive.  Defaults to the official
            CIFAR-10 URL; any URL accepted by ``urllib.request.urlretrieve``
            (including ``file://``) works, e.g. for a mirror.

    Returns:
        Path of the extracted ``cifar-10-batches-py`` directory.

    Raises:
        tarfile.TarError: If the archive is invalid or contains a member
            that would be written outside ``root``.
        FileNotFoundError: If the archive extracted without error but did
            not contain a ``cifar-10-batches-py`` directory.
        OSError: Download failures from ``urlretrieve`` propagate unchanged
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    ### BEGIN SOLUTION
    os.makedirs(root, exist_ok=True)
    dataset_dir = os.path.join(root, "cifar-10-batches-py")

    # Fast path: a previous call already extracted the dataset
    if os.path.exists(dataset_dir):
        print(f"✅ CIFAR-10 found at {dataset_dir}")
        return dataset_dir

    tar_path = os.path.join(root, "cifar-10.tar.gz")

    print("📥 Downloading CIFAR-10 (~170MB)...")
    urllib.request.urlretrieve(url, tar_path)
    print("✅ Downloaded!")

    print("📦 Extracting...")
    with tarfile.open(tar_path, 'r:gz') as tar:
        # The archive comes from the network: never extract it unfiltered
        _extract_tar_safely(tar, root)

    # Fail here rather than hand the caller a path that does not exist
    if not os.path.isdir(dataset_dir):
        raise FileNotFoundError(
            f"Archive from {url} did not contain 'cifar-10-batches-py'"
        )
    print("✅ Ready!")

    return dataset_dir
    ### END SOLUTION

class CIFAR10Dataset(Dataset):
    """CIFAR-10 dataset for CNN training, loaded fully into memory."""

    def __init__(self, root="./data", train=True, download=False):
        """
        Read the CIFAR-10 batch files and prepare image/label arrays.

        Args:
            root: Directory containing (or receiving) ``cifar-10-batches-py``.
            train: If true, load ``data_batch_1`` .. ``data_batch_5``;
                otherwise load ``test_batch``.
            download: If true, call ``download_cifar10(root)`` first;
                otherwise the files are expected to be present already.

        After construction ``self.data`` is a float32 array reshaped to
        (N, 3, 32, 32) and divided by 255.0, and ``self.labels`` is an
        integer array built from the batches' label lists.
        """
        ### BEGIN SOLUTION
        dataset_dir = (
            download_cifar10(root)
            if download
            else os.path.join(root, "cifar-10-batches-py")
        )

        def read_batch(file_name):
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only point this at archives from a trusted source.
            with open(os.path.join(dataset_dir, file_name), 'rb') as handle:
                return pickle.load(handle, encoding='bytes')

        if not train:
            # Single held-out batch
            test_batch = read_batch("test_batch")
            pixels = test_batch[b'data']
            targets = np.array(test_batch[b'labels'])
        else:
            # Five training batches, concatenated in file order
            train_batches = [read_batch(f"data_batch_{n}") for n in range(1, 6)]
            pixels = np.concatenate([part[b'data'] for part in train_batches])
            targets = np.array(
                [lbl for part in train_batches for lbl in part[b'labels']]
            )

        # Reshape flat rows to (N, 3, 32, 32) and scale by 1/255
        # (maps to [0, 1] assuming 8-bit pixel values)
        self.data = pixels.reshape(-1, 3, 32, 32).astype(np.float32) / 255.0
        self.labels = targets
        print(f"✅ Loaded {len(self.data):,} images")
        ### END SOLUTION

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``, each wrapped in a Tensor."""
        image = Tensor(self.data[idx])
        label = Tensor(self.labels[idx])
        return image, label

    def __len__(self):
        """Return the number of loaded images."""
        image_count = len(self.data)
        return image_count

    def get_num_classes(self):
        """Return the fixed class count of CIFAR-10."""
        return 10