Files
TinyTorch/tinytorch/core/dataloader.py
Vijay Janapa Reddi 83fb269d9f Complete migration from modules/ to assignments/source/ structure
- Migrated all Python source files to assignments/source/ structure
- Updated nbdev configuration to use assignments/source as nbs_path
- Updated all tito commands (nbgrader, export, test) to use new structure
- Fixed hardcoded paths in Python files and documentation
- Updated config.py to use assignments/source instead of modules
- Fixed test command to use correct file naming (short names vs full module names)
- Regenerated all notebook files with clean metadata
- Verified complete workflow: Python source → NBGrader → nbdev export → testing

All systems now working: NBGrader (14 source assignments, 1 released), nbdev export (7 generated files), and pytest integration.

The modules/ directory has been retired and replaced with standard NBGrader structure.
2025-07-12 12:06:56 -04:00

695 lines
25 KiB
Python

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../assignments/source/06_dataloader/dataloader_dev.ipynb.
# %% auto 0
__all__ = ['Dataset', 'CIFAR10Dataset', 'DataLoader', 'Normalizer', 'create_data_pipeline']
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 3
import numpy as np
import sys
import os
import pickle
import struct
from typing import List, Tuple, Optional, Union, Iterator
import matplotlib.pyplot as plt
import urllib.request
import tarfile
# Import our building blocks
from .tensor import Tensor
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 4
def _should_show_plots():
"""Check if we should show plots (disable during testing)"""
return 'pytest' not in sys.modules and 'test' not in sys.argv
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 6
class Dataset:
    """
    Abstract base class defining the TinyTorch dataset interface.

    Every concrete dataset inherits from this class and fills in the
    abstract methods below; it is the fundamental abstraction for data
    loading in TinyTorch.

    TODO: Implement the base Dataset class with the required methods.

    APPROACH:
    1. Specify the interface every dataset must satisfy
    2. Provide access to individual samples and to the dataset size
    3. Keep the class easy to extend to new data types

    EXAMPLE:
    dataset = CIFAR10Dataset("data/cifar10/")
    sample, label = dataset[0]  # first sample
    size = len(dataset)         # number of samples

    HINTS:
    - Declare abstract methods for subclasses to implement
    - __getitem__ handles indexing; __len__ reports the size
    - Add helpers for the sample shape and the class count
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Return the (data, label) pair stored at ``index``.

        Args:
            index: Position of the sample to fetch

        Returns:
            Tuple of (data, label) tensors

        TODO: Implement abstract method for getting samples.

        STEP-BY-STEP:
        1. Subclasses supply the concrete behavior of this abstract method
        2. Return a (data, label) tuple of tensors
        3. ``data`` holds the input features, ``label`` the target

        EXAMPLE:
        dataset[0] should return (Tensor(image_data), Tensor(label))
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Return how many samples the dataset contains.

        TODO: Implement abstract method for getting dataset size.

        STEP-BY-STEP:
        1. Subclasses supply the concrete behavior of this abstract method
        2. Return the total sample count

        EXAMPLE:
        len(dataset) should return 50000 for the CIFAR-10 training set
        """
        raise NotImplementedError("Student implementation required")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """
        Return the shape of a single data sample.

        TODO: Implement method to get sample shape.

        STEP-BY-STEP:
        1. Fetch the first sample via self[0]
        2. Take the data element (first item of the tuple)
        3. Return that tensor's shape

        EXAMPLE:
        For CIFAR-10: returns (3, 32, 32) for RGB images
        """
        raise NotImplementedError("Student implementation required")

    def get_num_classes(self) -> int:
        """
        Return the number of classes in the dataset.

        TODO: Implement abstract method for getting number of classes.

        STEP-BY-STEP:
        1. Subclasses supply the concrete behavior of this abstract method
        2. Return the total class count

        EXAMPLE:
        For CIFAR-10: returns 10 (airplane, car, bird, cat, deer, dog, frog, horse, ship, truck)
        """
        raise NotImplementedError("Student implementation required")
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 7
class Dataset:
    """Base Dataset class: Abstract interface for all datasets."""

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Fetch the (data, label) pair at ``index``; subclasses override."""
        raise NotImplementedError("Subclasses must implement __getitem__")

    def __len__(self) -> int:
        """Report the number of samples; subclasses override."""
        raise NotImplementedError("Subclasses must implement __len__")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """Shape of one data sample, derived from the first item."""
        first_item = self[0]
        return first_item[0].shape

    def get_num_classes(self) -> int:
        """Number of classes in the dataset; subclasses override."""
        raise NotImplementedError("Subclasses must implement get_num_classes")
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 11
class CIFAR10Dataset(Dataset):
    """
    CIFAR-10 Dataset: Load and manage CIFAR-10 image data.

    CIFAR-10 holds 60,000 color images of size 32x32 across 10 classes —
    a perfect playground for learning data loading and image processing.

    Args:
        root_dir: Directory containing CIFAR-10 files
        train: If True, load training data. If False, load test data.
        download: If True, download dataset if not present

    TODO: Implement CIFAR-10 dataset loading.

    APPROACH:
    1. Download the dataset when needed (show a progress bar!)
    2. Parse the binary batch files into images and labels
    3. Keep the data in memory for fast access
    4. Implement indexing and size methods

    EXAMPLE:
    dataset = CIFAR10Dataset("data/cifar10/", train=True)
    image, label = dataset[0]             # first image
    print(f"Image shape: {image.shape}")  # (3, 32, 32)
    print(f"Label: {label}")              # Tensor with class index

    HINTS:
    - pickle loads each binary batch file
    - Every batch file carries 'data' and 'labels' keys
    - Reshape the raw rows to (3, 32, 32)
    - Keep images and labels in separate lists
    - urllib.request.urlretrieve(url, filename, reporthook=progress_function)
      gives you a progress bar hook
    - The hook receives (block_num, block_size, total_size) parameters
    """

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        """
        Initialize CIFAR-10 dataset.

        Args:
            root_dir: Directory to store/load dataset
            train: If True, load training data. If False, load test data.
            download: If True, download dataset if not present

        TODO: Implement CIFAR-10 initialization.

        STEP-BY-STEP:
        1. Make sure the root directory exists
        2. Download the dataset when absent (with progress bar!)
        3. Parse the binary files into data
        4. Keep images and labels in memory
        5. Record the class names

        EXAMPLE:
        CIFAR10Dataset("data/cifar10/", train=True)
        yields a dataset with 50,000 training images

        PROGRESS BAR HINT:
        def show_progress(block_num, block_size, total_size):
            downloaded = block_num * block_size
            percent = (downloaded * 100) // total_size
            print(f"\\rDownloading: {percent}%", end='', flush=True)
        """
        raise NotImplementedError("Student implementation required")

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Return the (image, label) pair stored at ``index``.

        Args:
            index: Position of the sample to fetch

        Returns:
            Tuple of (image, label) tensors

        TODO: Implement sample retrieval.

        STEP-BY-STEP:
        1. Read the image from self.images[index]
        2. Read the label from self.labels[index]
        3. Return (Tensor(image), Tensor(label))

        EXAMPLE:
        image, label = dataset[0]
        image.shape should be (3, 32, 32)
        label should be integer 0-9
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Return the number of samples in the dataset.

        TODO: Return the length of the dataset.

        STEP-BY-STEP:
        1. Return len(self.images)

        EXAMPLE:
        Training set: 50,000 samples
        Test set: 10,000 samples
        """
        raise NotImplementedError("Student implementation required")

    def get_num_classes(self) -> int:
        """
        Return the number of CIFAR-10 classes.

        TODO: Return the number of classes.

        STEP-BY-STEP:
        1. CIFAR-10 defines exactly 10 classes
        2. Return 10

        EXAMPLE:
        Returns 10 for CIFAR-10
        """
        raise NotImplementedError("Student implementation required")
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 12
class CIFAR10Dataset(Dataset):
    """CIFAR-10 Dataset: Load and manage CIFAR-10 image data.

    Loads the python-pickle distribution of CIFAR-10 (60,000 32x32 RGB
    images in 10 classes) from ``root_dir``, optionally downloading and
    extracting the archive first.

    Args:
        root_dir: Directory that holds (or will receive) the dataset files.
        train: If True, load the five training batches; otherwise the test batch.
        download: If True, fetch and extract the archive when not present.
    """

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        self.root_dir = root_dir
        self.train = train
        self.class_names = ['airplane', 'car', 'bird', 'cat', 'deer',
                            'dog', 'frog', 'horse', 'ship', 'truck']
        # Ensure the target directory exists before downloading/loading.
        os.makedirs(root_dir, exist_ok=True)
        if download:
            self._download_if_needed()
        self._load_data()

    def _download_if_needed(self):
        """Download and extract CIFAR-10 if the extracted folder is absent."""
        cifar_path = os.path.join(self.root_dir, "cifar-10-batches-py")
        if not os.path.exists(cifar_path):
            print("🔄 Downloading CIFAR-10 dataset...")
            print("📦 Size: ~170MB (this may take a few minutes)")
            url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
            filename = os.path.join(self.root_dir, "cifar-10-python.tar.gz")
            try:
                def show_progress(block_num, block_size, total_size):
                    """urlretrieve reporthook: render an in-place progress bar."""
                    downloaded = block_num * block_size
                    if total_size > 0:
                        percent = min(100, (downloaded * 100) // total_size)
                        bar_length = 50
                        filled_length = (percent * bar_length) // 100
                        # BUG FIX: the bar characters were empty strings
                        # ('' * n), so the bar rendered as nothing. Use
                        # visible filled/empty glyphs instead.
                        bar = '█' * filled_length + '░' * (bar_length - filled_length)
                        # Convert bytes to MB for display
                        downloaded_mb = downloaded / (1024 * 1024)
                        total_mb = total_size / (1024 * 1024)
                        print(f"\r📥 [{bar}] {percent}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)", end='', flush=True)
                    else:
                        # Server did not report a Content-Length; show raw MB.
                        downloaded_mb = downloaded / (1024 * 1024)
                        print(f"\r📥 Downloaded: {downloaded_mb:.1f} MB", end='', flush=True)

                urllib.request.urlretrieve(url, filename, reporthook=show_progress)
                print()  # newline after the in-place progress bar
                print("📂 Extracting CIFAR-10 files...")
                # filter='data' (Python 3.12+) rejects unsafe archive members
                # such as absolute paths and symlinks.
                with tarfile.open(filename, 'r:gz') as tar:
                    tar.extractall(self.root_dir, filter='data')
                os.remove(filename)  # drop the archive once extracted
                print("✅ CIFAR-10 downloaded and extracted successfully!")
            except Exception as e:
                # Best-effort by design: report the failure and let
                # _load_data() proceed (it tolerates missing files and
                # simply loads 0 samples).
                print(f"\n❌ Download failed: {e}")
                print("Please download CIFAR-10 manually from https://www.cs.toronto.edu/~kriz/cifar.html")

    def _load_data(self):
        """Parse the pickled CIFAR-10 batch files into memory.

        Populates ``self.images`` (float32 arrays shaped (3, 32, 32)) and
        ``self.labels`` (ints 0-9). Missing files are skipped, which can
        leave the dataset empty when the download failed.
        """
        cifar_path = os.path.join(self.root_dir, "cifar-10-batches-py")
        self.images = []
        self.labels = []
        if self.train:
            # The training split is spread over data_batch_1 .. data_batch_5.
            for i in range(1, 6):
                batch_file = os.path.join(cifar_path, f"data_batch_{i}")
                if os.path.exists(batch_file):
                    # NOTE: pickle.load on downloaded files is only as
                    # trustworthy as the download source.
                    with open(batch_file, 'rb') as f:
                        batch = pickle.load(f, encoding='bytes')
                    # The pickles use bytes keys; normalize them to str.
                    batch = {k.decode('utf-8') if isinstance(k, bytes) else k: v for k, v in batch.items()}
                    # Each row is 3072 values; reshape to channel-first images.
                    images = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
                    labels = batch['labels']
                    self.images.extend(images)
                    self.labels.extend(labels)
        else:
            # The test split lives in a single file.
            test_file = os.path.join(cifar_path, "test_batch")
            if os.path.exists(test_file):
                with open(test_file, 'rb') as f:
                    batch = pickle.load(f, encoding='bytes')
                batch = {k.decode('utf-8') if isinstance(k, bytes) else k: v for k, v in batch.items()}
                self.images = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
                self.labels = batch['labels']
        print(f"✅ Loaded {len(self.images)} {'training' if self.train else 'test'} samples")

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return the (image, label) tensors stored at ``index``."""
        image = Tensor(self.images[index])
        label = Tensor(np.array(self.labels[index]))
        return image, label

    def __len__(self) -> int:
        """Return the number of loaded samples."""
        return len(self.images)

    def get_num_classes(self) -> int:
        """Return 10, the fixed number of CIFAR-10 classes."""
        return 10
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 20
class DataLoader:
    """
    DataLoader: Efficiently batch and iterate through datasets.

    Adds batching, shuffling, and convenient iteration on top of any
    Dataset — the workhorse of neural-network training loops.

    Args:
        dataset: Dataset to load from
        batch_size: Number of samples per batch
        shuffle: Whether to shuffle data each epoch

    TODO: Implement DataLoader with batching and shuffling.

    APPROACH:
    1. Remember the dataset and configuration
    2. Make __iter__ yield one batch at a time
    3. Handle the shuffling and batching logic
    4. Stack individual samples into batch tensors

    EXAMPLE:
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    for batch_images, batch_labels in dataloader:
        print(f"Batch shape: {batch_images.shape}")  # (32, 3, 32, 32)

    HINTS:
    - np.random.permutation shuffles indices
    - np.stack combines samples into a batch
    - Yield (batch_data, batch_labels) tuples
    - The final batch may be smaller than batch_size
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """
        Initialize DataLoader.

        Args:
            dataset: Dataset to load from
            batch_size: Number of samples per batch
            shuffle: Whether to shuffle data each epoch

        TODO: Store configuration and dataset.

        STEP-BY-STEP:
        1. Keep dataset as self.dataset
        2. Keep batch_size as self.batch_size
        3. Keep shuffle as self.shuffle

        EXAMPLE:
        DataLoader(dataset, batch_size=32, shuffle=True)
        """
        raise NotImplementedError("Student implementation required")

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """
        Iterate through the dataset one batch at a time.

        Returns:
            Iterator yielding (batch_data, batch_labels) tuples

        TODO: Implement batching and shuffling logic.

        STEP-BY-STEP:
        1. Build the index list: list(range(len(dataset)))
        2. Shuffle it when self.shuffle is True
        3. Walk the indices in batch_size chunks
        4. For each chunk: collect samples, stack them, yield the batch

        EXAMPLE:
        for batch_data, batch_labels in dataloader:
            # batch_data.shape: (batch_size, ...)
            # batch_labels.shape: (batch_size,)
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Return the number of batches per epoch.

        TODO: Calculate number of batches.

        STEP-BY-STEP:
        1. Read the dataset size: len(self.dataset)
        2. Compute (dataset_size + batch_size - 1) // batch_size
        3. The ceiling division counts the last partial batch

        EXAMPLE:
        Dataset size: 100, batch_size: 32
        Number of batches: 4 (32, 32, 32, 4)
        """
        raise NotImplementedError("Student implementation required")
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 21
class DataLoader:
    """DataLoader: Efficiently batch and iterate through datasets."""

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        # Configuration is stored as-is; the iteration order is decided
        # fresh on every pass through __iter__.
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """Yield (batch_data, batch_labels) tensor pairs, one batch at a time."""
        order = list(range(len(self.dataset)))
        if self.shuffle:
            # In-place reshuffle gives a new ordering each epoch.
            np.random.shuffle(order)
        step = self.batch_size
        for start in range(0, len(order), step):
            chunk = order[start:start + step]
            # Gather the raw arrays for this chunk, then stack along a new
            # leading batch axis. The last chunk may be shorter than step.
            samples = [self.dataset[position] for position in chunk]
            stacked_data = np.stack([item.data for item, _ in samples], axis=0)
            stacked_labels = np.stack([target.data for _, target in samples], axis=0)
            yield Tensor(stacked_data), Tensor(stacked_labels)

    def __len__(self) -> int:
        """Number of batches per epoch, counting a trailing partial batch."""
        return -(-len(self.dataset) // self.batch_size)
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 26
class Normalizer:
    """
    Data Normalizer: Standardize data for better training.

    Learns the mean and standard deviation from training data, then
    normalizes new data with those statistics.

    TODO: Implement data normalization.

    APPROACH:
    1. fit: derive mean and std from training data
    2. transform: normalize using the stored stats
    3. Support both single tensors and lists of tensors

    EXAMPLE:
    normalizer = Normalizer()
    normalizer.fit(training_data)                # compute stats
    normalized = normalizer.transform(new_data)  # apply normalization

    HINTS:
    - Keep mean and std as instance variables
    - np.mean and np.std compute the statistics
    - Apply: (data - mean) / std
    - Guard against division by zero with a small epsilon
    """

    def __init__(self):
        """
        Initialize normalizer.

        TODO: Initialize mean and std to None.

        STEP-BY-STEP:
        1. Set self.mean = None
        2. Set self.std = None
        3. Set self.epsilon = 1e-8 (for numerical stability)

        EXAMPLE:
        normalizer = Normalizer()
        """
        raise NotImplementedError("Student implementation required")

    def fit(self, data: List[Tensor]):
        """
        Learn normalization statistics from training data.

        Args:
            data: List of tensors to compute statistics from

        TODO: Compute mean and standard deviation.

        STEP-BY-STEP:
        1. Stack everything: np.stack([t.data for t in data])
        2. mean = np.mean(stacked_data)
        3. std = np.std(stacked_data)
        4. Save them as self.mean and self.std

        EXAMPLE:
        normalizer.fit([tensor1, tensor2, tensor3])
        """
        raise NotImplementedError("Student implementation required")

    def transform(self, data: Union[Tensor, List[Tensor]]) -> Union[Tensor, List[Tensor]]:
        """
        Normalize data with the learned statistics.

        Args:
            data: Tensor or list of tensors to normalize

        Returns:
            Normalized tensor(s)

        TODO: Apply normalization using computed statistics.

        STEP-BY-STEP:
        1. Verify mean and std were computed (not None)
        2. Single tensor: apply (data - mean) / (std + epsilon)
        3. List: normalize every tensor in it
        4. Return the normalized data

        EXAMPLE:
        normalized = normalizer.transform(tensor)
        """
        raise NotImplementedError("Student implementation required")
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 27
class Normalizer:
    """Data Normalizer: Standardize data for better training."""

    def __init__(self):
        # Statistics stay unset until fit() is called.
        self.mean = None
        self.std = None
        self.epsilon = 1e-8  # guards the division when std is ~0

    def fit(self, data: List[Tensor]):
        """Compute normalization statistics from training data."""
        stacked = np.stack([tensor.data for tensor in data])
        self.mean = np.mean(stacked)
        self.std = np.std(stacked)
        print(f"✅ Computed normalization stats: mean={self.mean:.4f}, std={self.std:.4f}")

    def transform(self, data: Union[Tensor, List[Tensor]]) -> Union[Tensor, List[Tensor]]:
        """Apply normalization to data."""
        if self.mean is None or self.std is None:
            raise ValueError("Must call fit() before transform()")

        def _norm(array):
            # Shared (x - mean) / (std + eps) computation.
            return (array - self.mean) / (self.std + self.epsilon)

        if isinstance(data, list):
            return [Tensor(_norm(item.data)) for item in data]
        return Tensor(_norm(data.data))
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 31
def create_data_pipeline(dataset_path: str = "data/cifar10/",
                         batch_size: int = 32,
                         normalize: bool = True,
                         shuffle: bool = True):
    """
    Build the complete data pipeline for training.

    Args:
        dataset_path: Path to dataset
        batch_size: Batch size for training
        normalize: Whether to normalize data
        shuffle: Whether to shuffle data

    Returns:
        Tuple of (train_loader, test_loader)

    TODO: Implement complete data pipeline.

    APPROACH:
    1. Build the train and test datasets
    2. Wrap them in data loaders
    3. Fit a normalizer on training data
    4. Return the assembled components

    EXAMPLE:
    train_loader, test_loader = create_data_pipeline()
    for batch_data, batch_labels in train_loader:
        # Ready for training!
    """
    raise NotImplementedError("Student implementation required")
# %% ../../assignments/source/06_dataloader/dataloader_dev.ipynb 32
def create_data_pipeline(dataset_path: str = "data/cifar10/",
                         batch_size: int = 32,
                         normalize: bool = True,
                         shuffle: bool = True):
    """Create a complete data pipeline for training.

    Args:
        dataset_path: Directory containing (or receiving) CIFAR-10.
        batch_size: Samples per batch for both loaders.
        normalize: If True, fit a Normalizer on a subset of training data.
        shuffle: Whether the training loader reshuffles each epoch.

    Returns:
        Tuple of (train_loader, test_loader).
    """
    print("🔧 Creating data pipeline...")
    # Create datasets with real CIFAR-10 data (downloads on first use).
    train_dataset = CIFAR10Dataset(dataset_path, train=True, download=True)
    test_dataset = CIFAR10Dataset(dataset_path, train=False, download=True)
    # Create data loaders; the test loader is never shuffled so that
    # evaluation order stays reproducible.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    # Create normalizer
    normalizer = None
    if normalize:
        normalizer = Normalizer()
        # Fit on a subset of training data for efficiency.
        sample_data = [train_dataset[i][0] for i in range(min(1000, len(train_dataset)))]
        # BUG FIX: fit() itself prints the computed statistics, and an
        # identical second print used to follow here — removed the duplicate.
        normalizer.fit(sample_data)
        # NOTE(review): the fitted normalizer is neither returned nor applied
        # to the loaders, so callers receive un-normalized batches. Confirm
        # whether the pipeline is meant to expose it.
    print(f"✅ Pipeline created:")
    print(f" - Training batches: {len(train_loader)}")
    print(f" - Test batches: {len(test_loader)}")
    print(f" - Batch size: {batch_size}")
    print(f" - Normalization: {normalize}")
    return train_loader, test_loader