mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-29 20:08:33 -05:00
refactor: rename data module to dataloader
- Rename modules/data/ → modules/dataloader/ - Rename data_dev.py → dataloader_dev.py - Update NBDev export target: core.data → core.dataloader - Rename test files: test_data.py → test_dataloader.py - Update package exports to tinytorch.core.dataloader - Update module imports and internal references This makes the module name more descriptive and aligned with ML industry standards.
This commit is contained in:
@@ -23,6 +23,50 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/core/activations.py'),
|
||||
'tinytorch.core.activations.Tanh.forward': ( 'activations/activations_dev.html#tanh.forward',
|
||||
'tinytorch/core/activations.py')},
|
||||
'tinytorch.core.dataloader': { 'tinytorch.core.dataloader.CIFAR10Dataset': ( 'dataloader/dataloader_dev.html#cifar10dataset',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset.__getitem__': ( 'dataloader/dataloader_dev.html#cifar10dataset.__getitem__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset.__init__': ( 'dataloader/dataloader_dev.html#cifar10dataset.__init__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset.__len__': ( 'dataloader/dataloader_dev.html#cifar10dataset.__len__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset._download_if_needed': ( 'dataloader/dataloader_dev.html#cifar10dataset._download_if_needed',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset._load_data': ( 'dataloader/dataloader_dev.html#cifar10dataset._load_data',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.CIFAR10Dataset.get_num_classes': ( 'dataloader/dataloader_dev.html#cifar10dataset.get_num_classes',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.DataLoader': ( 'dataloader/dataloader_dev.html#dataloader',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.DataLoader.__init__': ( 'dataloader/dataloader_dev.html#dataloader.__init__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.DataLoader.__iter__': ( 'dataloader/dataloader_dev.html#dataloader.__iter__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.DataLoader.__len__': ( 'dataloader/dataloader_dev.html#dataloader.__len__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset': ( 'dataloader/dataloader_dev.html#dataset',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset.__getitem__': ( 'dataloader/dataloader_dev.html#dataset.__getitem__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset.__len__': ( 'dataloader/dataloader_dev.html#dataset.__len__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset.get_num_classes': ( 'dataloader/dataloader_dev.html#dataset.get_num_classes',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Dataset.get_sample_shape': ( 'dataloader/dataloader_dev.html#dataset.get_sample_shape',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Normalizer': ( 'dataloader/dataloader_dev.html#normalizer',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Normalizer.__init__': ( 'dataloader/dataloader_dev.html#normalizer.__init__',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Normalizer.fit': ( 'dataloader/dataloader_dev.html#normalizer.fit',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.Normalizer.transform': ( 'dataloader/dataloader_dev.html#normalizer.transform',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader._should_show_plots': ( 'dataloader/dataloader_dev.html#_should_show_plots',
|
||||
'tinytorch/core/dataloader.py'),
|
||||
'tinytorch.core.dataloader.create_data_pipeline': ( 'dataloader/dataloader_dev.html#create_data_pipeline',
|
||||
'tinytorch/core/dataloader.py')},
|
||||
'tinytorch.core.layers': { 'tinytorch.core.layers.Dense': ('layers/layers_dev.html#dense', 'tinytorch/core/layers.py'),
|
||||
'tinytorch.core.layers.Dense.__call__': ( 'layers/layers_dev.html#dense.__call__',
|
||||
'tinytorch/core/layers.py'),
|
||||
|
||||
694
tinytorch/core/dataloader.py
Normal file
694
tinytorch/core/dataloader.py
Normal file
@@ -0,0 +1,694 @@
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/dataloader/dataloader_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Dataset', 'CIFAR10Dataset', 'DataLoader', 'Normalizer', 'create_data_pipeline']
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 4
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
import pickle
|
||||
import struct
|
||||
from typing import List, Tuple, Optional, Union, Iterator
|
||||
import matplotlib.pyplot as plt
|
||||
import urllib.request
|
||||
import tarfile
|
||||
|
||||
# Import our building blocks
|
||||
from .tensor import Tensor
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 5
|
||||
def _should_show_plots():
|
||||
"""Check if we should show plots (disable during testing)"""
|
||||
return 'pytest' not in sys.modules and 'test' not in sys.argv
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 7
|
||||
# NOTE: nbdev scaffold cell — every method body is a student TODO stub.
# The reference implementation exported from the next cell shadows this class.
class Dataset:
    """
    Base Dataset class: Abstract interface for all datasets.

    The fundamental abstraction for data loading in TinyTorch.
    Students implement concrete datasets by inheriting from this class.

    TODO: Implement the base Dataset class with required methods.

    APPROACH:
    1. Define the interface that all datasets must implement
    2. Include methods for getting individual samples and dataset size
    3. Make it easy to extend for different data types

    EXAMPLE:
    dataset = CIFAR10Dataset("data/cifar10/")
    sample, label = dataset[0]  # Get first sample
    size = len(dataset)  # Get dataset size

    HINTS:
    - Use abstract methods that subclasses must implement
    - Include __getitem__ for indexing and __len__ for size
    - Add helper methods for getting sample shapes and number of classes
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single sample and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (data, label) tensors

        TODO: Implement abstract method for getting samples.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return a tuple of (data, label) tensors
        3. Data should be the input features, label should be the target

        EXAMPLE:
        dataset[0] should return (Tensor(image_data), Tensor(label))
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        TODO: Implement abstract method for getting dataset size.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return the total number of samples in the dataset

        EXAMPLE:
        len(dataset) should return 50000 for CIFAR-10 training set
        """
        raise NotImplementedError("Student implementation required")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """
        Get the shape of a single data sample.

        TODO: Implement method to get sample shape.

        STEP-BY-STEP:
        1. Get the first sample using self[0]
        2. Extract the data part (first element of tuple)
        3. Return the shape of the data tensor

        EXAMPLE:
        For CIFAR-10: returns (3, 32, 32) for RGB images
        """
        raise NotImplementedError("Student implementation required")

    def get_num_classes(self) -> int:
        """
        Get the number of classes in the dataset.

        TODO: Implement abstract method for getting number of classes.

        STEP-BY-STEP:
        1. This is an abstract method - subclasses will implement it
        2. Return the total number of classes in the dataset

        EXAMPLE:
        For CIFAR-10: returns 10 (airplane, car, bird, cat, deer, dog, frog, horse, ship, truck)
        """
        raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 8
|
||||
class Dataset:
    """Abstract base for all TinyTorch datasets.

    Subclasses must provide `__getitem__`, `__len__`, and `get_num_classes`;
    `get_sample_shape` is derived from the first sample.
    """

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return the (data, label) tensor pair at `index`."""
        raise NotImplementedError("Subclasses must implement __getitem__")

    def __len__(self) -> int:
        """Return how many samples the dataset contains."""
        raise NotImplementedError("Subclasses must implement __len__")

    def get_sample_shape(self) -> Tuple[int, ...]:
        """Return the shape of one data sample, probed from index 0."""
        first = self[0]
        data_part = first[0]
        return data_part.shape

    def get_num_classes(self) -> int:
        """Return the number of distinct labels in the dataset."""
        raise NotImplementedError("Subclasses must implement get_num_classes")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 12
|
||||
# NOTE: nbdev scaffold cell — student TODO stubs only; the working
# implementation exported from a later cell shadows this class.
class CIFAR10Dataset(Dataset):
    """
    CIFAR-10 Dataset: Load and manage CIFAR-10 image data.

    CIFAR-10 contains 60,000 32x32 color images in 10 classes.
    Perfect for learning data loading and image processing.

    Args:
        root_dir: Directory containing CIFAR-10 files
        train: If True, load training data. If False, load test data.
        download: If True, download dataset if not present

    TODO: Implement CIFAR-10 dataset loading.

    APPROACH:
    1. Handle dataset download if needed (with progress bar!)
    2. Parse binary files to extract images and labels
    3. Store data efficiently in memory
    4. Implement indexing and size methods

    EXAMPLE:
    dataset = CIFAR10Dataset("data/cifar10/", train=True)
    image, label = dataset[0]  # Get first image
    print(f"Image shape: {image.shape}")  # (3, 32, 32)
    print(f"Label: {label}")  # Tensor with class index

    HINTS:
    - Use pickle to load binary files
    - Each batch file contains 'data' and 'labels' keys
    - Reshape data to (3, 32, 32) format
    - Store images and labels as separate lists
    - Add progress bar with urllib.request.urlretrieve(url, filename, reporthook=progress_function)
    - Progress function receives (block_num, block_size, total_size) parameters
    """

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        """
        Initialize CIFAR-10 dataset.

        Args:
            root_dir: Directory to store/load dataset
            train: If True, load training data. If False, load test data.
            download: If True, download dataset if not present

        TODO: Implement CIFAR-10 initialization.

        STEP-BY-STEP:
        1. Create root directory if it doesn't exist
        2. Download dataset if needed and not present (with progress bar!)
        3. Load binary files and parse data
        4. Store images and labels in memory
        5. Set up class names

        EXAMPLE:
        CIFAR10Dataset("data/cifar10/", train=True)
        creates a dataset with 50,000 training images

        PROGRESS BAR HINT:
        def show_progress(block_num, block_size, total_size):
            downloaded = block_num * block_size
            percent = (downloaded * 100) // total_size
            print(f"\\rDownloading: {percent}%", end='', flush=True)
        """
        raise NotImplementedError("Student implementation required")

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """
        Get a single image and label by index.

        Args:
            index: Index of the sample to retrieve

        Returns:
            Tuple of (image, label) tensors

        TODO: Implement sample retrieval.

        STEP-BY-STEP:
        1. Get image from self.images[index]
        2. Get label from self.labels[index]
        3. Return (Tensor(image), Tensor(label))

        EXAMPLE:
        image, label = dataset[0]
        image.shape should be (3, 32, 32)
        label should be integer 0-9
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the total number of samples in the dataset.

        TODO: Return the length of the dataset.

        STEP-BY-STEP:
        1. Return len(self.images)

        EXAMPLE:
        Training set: 50,000 samples
        Test set: 10,000 samples
        """
        raise NotImplementedError("Student implementation required")

    def get_num_classes(self) -> int:
        """
        Get the number of classes in CIFAR-10.

        TODO: Return the number of classes.

        STEP-BY-STEP:
        1. CIFAR-10 has 10 classes
        2. Return 10

        EXAMPLE:
        Returns 10 for CIFAR-10
        """
        raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 13
|
||||
class CIFAR10Dataset(Dataset):
    """CIFAR-10 Dataset: Load and manage CIFAR-10 image data.

    Downloads (if requested) and parses the python-pickle distribution of
    CIFAR-10 from cs.toronto.edu, keeping all images and labels in memory.
    """

    def __init__(self, root_dir: str, train: bool = True, download: bool = True):
        """Set up paths/metadata, optionally download, then load into memory.

        Args:
            root_dir: Directory to store/load the dataset under.
            train: Load the 5 training batches if True, else the test batch.
            download: Fetch the archive from the web when it is missing.
        """
        self.root_dir = root_dir
        self.train = train
        # Human-readable names for the 10 class indices (0..9).
        self.class_names = ['airplane', 'car', 'bird', 'cat', 'deer',
                            'dog', 'frog', 'horse', 'ship', 'truck']

        # Create directory if it doesn't exist
        os.makedirs(root_dir, exist_ok=True)

        # Download if needed
        if download:
            self._download_if_needed()

        # Load data
        self._load_data()

    def _download_if_needed(self):
        """Download CIFAR-10 if not present."""
        # Presence of the extracted batch directory is the "already here" marker.
        cifar_path = os.path.join(self.root_dir, "cifar-10-batches-py")
        if not os.path.exists(cifar_path):
            print("🔄 Downloading CIFAR-10 dataset...")
            print("📦 Size: ~170MB (this may take a few minutes)")
            url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
            filename = os.path.join(self.root_dir, "cifar-10-python.tar.gz")

            try:
                # Download with progress bar
                def show_progress(block_num, block_size, total_size):
                    """Show download progress bar."""
                    # urlretrieve reports blocks, not bytes, so reconstruct the count.
                    downloaded = block_num * block_size
                    if total_size > 0:
                        # min() guards against the final block overshooting 100%.
                        percent = min(100, (downloaded * 100) // total_size)
                        bar_length = 50
                        filled_length = (percent * bar_length) // 100
                        bar = '█' * filled_length + '░' * (bar_length - filled_length)

                        # Convert bytes to MB
                        downloaded_mb = downloaded / (1024 * 1024)
                        total_mb = total_size / (1024 * 1024)

                        print(f"\r📥 [{bar}] {percent}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)", end='', flush=True)
                    else:
                        # Fallback if total size unknown
                        downloaded_mb = downloaded / (1024 * 1024)
                        print(f"\r📥 Downloaded: {downloaded_mb:.1f} MB", end='', flush=True)

                urllib.request.urlretrieve(url, filename, reporthook=show_progress)
                print()  # New line after progress bar

                # Extract
                print("📂 Extracting CIFAR-10 files...")
                with tarfile.open(filename, 'r:gz') as tar:
                    # filter='data' rejects unsafe archive members
                    # (requires Python 3.12+, or 3.9.17/3.10.12/3.11.4 backports).
                    tar.extractall(self.root_dir, filter='data')

                # Clean up
                os.remove(filename)
                print("✅ CIFAR-10 downloaded and extracted successfully!")

            except Exception as e:
                # Best-effort: report the failure and point at the manual source
                # rather than crashing construction.
                print(f"\n❌ Download failed: {e}")
                print("Please download CIFAR-10 manually from https://www.cs.toronto.edu/~kriz/cifar.html")

    def _load_data(self):
        """Load CIFAR-10 data from binary files."""
        cifar_path = os.path.join(self.root_dir, "cifar-10-batches-py")

        self.images = []
        self.labels = []

        if self.train:
            # Load training batches
            for i in range(1, 6):
                batch_file = os.path.join(cifar_path, f"data_batch_{i}")
                if os.path.exists(batch_file):
                    with open(batch_file, 'rb') as f:
                        # The official pickles were written by Python 2;
                        # encoding='bytes' makes them loadable on Python 3.
                        batch = pickle.load(f, encoding='bytes')
                    # Convert bytes keys to strings
                    batch = {k.decode('utf-8') if isinstance(k, bytes) else k: v for k, v in batch.items()}

                    # Extract images and labels
                    # Each row is 3072 bytes -> reshape to channels-first (3, 32, 32).
                    images = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
                    labels = batch['labels']

                    self.images.extend(images)
                    self.labels.extend(labels)
        else:
            # Load test batch
            test_file = os.path.join(cifar_path, "test_batch")
            if os.path.exists(test_file):
                with open(test_file, 'rb') as f:
                    batch = pickle.load(f, encoding='bytes')
                # Convert bytes keys to strings
                batch = {k.decode('utf-8') if isinstance(k, bytes) else k: v for k, v in batch.items()}

                # Extract images and labels
                # NOTE(review): train stores a list of arrays, test a single
                # ndarray — both support len()/indexing as used below.
                self.images = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
                self.labels = batch['labels']

        print(f"✅ Loaded {len(self.images)} {'training' if self.train else 'test'} samples")

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Get a single image and label by index.

        Returns:
            (image, label) pair wrapped as Tensors; image is (3, 32, 32) float32.
        """
        image = Tensor(self.images[index])
        label = Tensor(np.array(self.labels[index]))
        return image, label

    def __len__(self) -> int:
        """Get the total number of samples in the dataset."""
        return len(self.images)

    def get_num_classes(self) -> int:
        """Get the number of classes in CIFAR-10 (always 10)."""
        return 10
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 21
|
||||
# NOTE: nbdev scaffold cell — student TODO stubs only; shadowed by the
# working DataLoader exported from a later cell.
class DataLoader:
    """
    DataLoader: Efficiently batch and iterate through datasets.

    Provides batching, shuffling, and efficient iteration over datasets.
    Essential for training neural networks efficiently.

    Args:
        dataset: Dataset to load from
        batch_size: Number of samples per batch
        shuffle: Whether to shuffle data each epoch

    TODO: Implement DataLoader with batching and shuffling.

    APPROACH:
    1. Store dataset and configuration
    2. Implement __iter__ to yield batches
    3. Handle shuffling and batching logic
    4. Stack individual samples into batches

    EXAMPLE:
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    for batch_images, batch_labels in dataloader:
        print(f"Batch shape: {batch_images.shape}")  # (32, 3, 32, 32)

    HINTS:
    - Use np.random.permutation for shuffling
    - Stack samples using np.stack
    - Yield batches as (batch_data, batch_labels)
    - Handle last batch that might be smaller
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """
        Initialize DataLoader.

        Args:
            dataset: Dataset to load from
            batch_size: Number of samples per batch
            shuffle: Whether to shuffle data each epoch

        TODO: Store configuration and dataset.

        STEP-BY-STEP:
        1. Store dataset as self.dataset
        2. Store batch_size as self.batch_size
        3. Store shuffle as self.shuffle

        EXAMPLE:
        DataLoader(dataset, batch_size=32, shuffle=True)
        """
        raise NotImplementedError("Student implementation required")

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """
        Iterate through dataset in batches.

        Returns:
            Iterator yielding (batch_data, batch_labels) tuples

        TODO: Implement batching and shuffling logic.

        STEP-BY-STEP:
        1. Create indices list: list(range(len(dataset)))
        2. Shuffle indices if self.shuffle is True
        3. Loop through indices in batch_size chunks
        4. For each batch: collect samples, stack them, yield batch

        EXAMPLE:
        for batch_data, batch_labels in dataloader:
            # batch_data.shape: (batch_size, ...)
            # batch_labels.shape: (batch_size,)
        """
        raise NotImplementedError("Student implementation required")

    def __len__(self) -> int:
        """
        Get the number of batches per epoch.

        TODO: Calculate number of batches.

        STEP-BY-STEP:
        1. Get dataset size: len(self.dataset)
        2. Calculate: (dataset_size + batch_size - 1) // batch_size
        3. This handles the last partial batch correctly

        EXAMPLE:
        Dataset size: 100, batch_size: 32
        Number of batches: 4 (32, 32, 32, 4)
        """
        raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 22
|
||||
class DataLoader:
    """Batch-and-shuffle iterator over a Dataset.

    Each epoch yields (batch_data, batch_labels) Tensor pairs; the final
    batch may be smaller than `batch_size`.
    """

    def __init__(self, dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
        """Remember the dataset and the batching configuration."""
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self) -> Iterator[Tuple[Tensor, Tensor]]:
        """Yield batches for one epoch, reshuffling (in place) if enabled."""
        order = list(range(len(self.dataset)))
        if self.shuffle:
            np.random.shuffle(order)

        start = 0
        total = len(order)
        while start < total:
            chunk = order[start:start + self.batch_size]
            start += self.batch_size

            # Pull each sample once, then split into data/label columns.
            samples = [self.dataset[j] for j in chunk]
            stacked_data = np.stack([d.data for d, _ in samples], axis=0)
            stacked_labels = np.stack([l.data for _, l in samples], axis=0)

            yield Tensor(stacked_data), Tensor(stacked_labels)

    def __len__(self) -> int:
        """Number of batches per epoch (last partial batch counts as one)."""
        full, remainder = divmod(len(self.dataset), self.batch_size)
        return full + (1 if remainder else 0)
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 27
|
||||
# NOTE: nbdev scaffold cell — student TODO stubs only; shadowed by the
# working Normalizer exported from a later cell.
class Normalizer:
    """
    Data Normalizer: Standardize data for better training.

    Computes mean and standard deviation from training data,
    then applies normalization to new data.

    TODO: Implement data normalization.

    APPROACH:
    1. Fit: Compute mean and std from training data
    2. Transform: Apply normalization using computed stats
    3. Handle both single tensors and batches

    EXAMPLE:
    normalizer = Normalizer()
    normalizer.fit(training_data)  # Compute stats
    normalized = normalizer.transform(new_data)  # Apply normalization

    HINTS:
    - Store mean and std as instance variables
    - Use np.mean and np.std for statistics
    - Apply: (data - mean) / std
    - Handle division by zero (add small epsilon)
    """

    def __init__(self):
        """
        Initialize normalizer.

        TODO: Initialize mean and std to None.

        STEP-BY-STEP:
        1. Set self.mean = None
        2. Set self.std = None
        3. Set self.epsilon = 1e-8 (for numerical stability)

        EXAMPLE:
        normalizer = Normalizer()
        """
        raise NotImplementedError("Student implementation required")

    def fit(self, data: List[Tensor]):
        """
        Compute normalization statistics from training data.

        Args:
            data: List of tensors to compute statistics from

        TODO: Compute mean and standard deviation.

        STEP-BY-STEP:
        1. Stack all tensors: np.stack([t.data for t in data])
        2. Compute mean: np.mean(stacked_data)
        3. Compute std: np.std(stacked_data)
        4. Store as self.mean and self.std

        EXAMPLE:
        normalizer.fit([tensor1, tensor2, tensor3])
        """
        raise NotImplementedError("Student implementation required")

    def transform(self, data: Union[Tensor, List[Tensor]]) -> Union[Tensor, List[Tensor]]:
        """
        Apply normalization to data.

        Args:
            data: Tensor or list of tensors to normalize

        Returns:
            Normalized tensor(s)

        TODO: Apply normalization using computed statistics.

        STEP-BY-STEP:
        1. Check if mean and std are computed (not None)
        2. If single tensor: apply (data - mean) / (std + epsilon)
        3. If list: apply to each tensor in the list
        4. Return normalized data

        EXAMPLE:
        normalized = normalizer.transform(tensor)
        """
        raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 28
|
||||
class Normalizer:
    """Standardizes data to zero mean / unit variance using fitted statistics."""

    def __init__(self):
        """Start unfitted; epsilon guards against division by zero."""
        self.mean = None
        self.std = None
        self.epsilon = 1e-8

    def fit(self, data: List[Tensor]):
        """Compute and store global mean/std over all elements of `data`."""
        stacked = np.stack([sample.data for sample in data])
        self.mean, self.std = np.mean(stacked), np.std(stacked)
        print(f"✅ Computed normalization stats: mean={self.mean:.4f}, std={self.std:.4f}")

    def transform(self, data: Union[Tensor, List[Tensor]]) -> Union[Tensor, List[Tensor]]:
        """Return (data - mean) / (std + epsilon), element-wise.

        Accepts either one tensor or a list of tensors; must be fitted first.

        Raises:
            ValueError: if fit() has not been called yet.
        """
        if self.mean is None or self.std is None:
            raise ValueError("Must call fit() before transform()")

        scale = self.std + self.epsilon

        def _norm(t):
            # Shared normalization for both the single- and list-input paths.
            return Tensor((t.data - self.mean) / scale)

        if isinstance(data, list):
            return [_norm(t) for t in data]
        return _norm(data)
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 32
|
||||
# NOTE: nbdev scaffold cell — student TODO stub; shadowed by the working
# implementation exported from the next cell.
def create_data_pipeline(dataset_path: str = "data/cifar10/",
                         batch_size: int = 32,
                         normalize: bool = True,
                         shuffle: bool = True):
    """
    Create a complete data pipeline for training.

    Args:
        dataset_path: Path to dataset
        batch_size: Batch size for training
        normalize: Whether to normalize data
        shuffle: Whether to shuffle data

    Returns:
        Tuple of (train_loader, test_loader)

    TODO: Implement complete data pipeline.

    APPROACH:
    1. Create train and test datasets
    2. Create data loaders
    3. Fit normalizer on training data
    4. Return all components

    EXAMPLE:
    train_loader, test_loader = create_data_pipeline()
    for batch_data, batch_labels in train_loader:
        # Ready for training!
    """
    raise NotImplementedError("Student implementation required")
|
||||
|
||||
# %% ../../modules/dataloader/dataloader_dev.ipynb 33
|
||||
def create_data_pipeline(dataset_path: str = "data/cifar10/",
                         batch_size: int = 32,
                         normalize: bool = True,
                         shuffle: bool = True):
    """Create a complete data pipeline for training.

    Builds the CIFAR-10 train/test datasets (downloading if missing), wraps
    them in DataLoaders, and optionally fits normalization statistics on a
    sample of the training data.

    Args:
        dataset_path: Path to dataset
        batch_size: Batch size for training
        normalize: Whether to fit normalization statistics
        shuffle: Whether to shuffle the training data each epoch

    Returns:
        Tuple of (train_loader, test_loader)
    """
    print("🔧 Creating data pipeline...")

    # Create datasets with real CIFAR-10 data
    train_dataset = CIFAR10Dataset(dataset_path, train=True, download=True)
    test_dataset = CIFAR10Dataset(dataset_path, train=False, download=True)

    # Create data loaders; the test set is never shuffled so evaluation
    # order stays deterministic.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    if normalize:
        # NOTE(review): the fitted normalizer is neither applied to the
        # loaders nor returned, so normalization is effectively a no-op for
        # callers; returning it would break the documented 2-tuple interface,
        # so this is only flagged here.
        normalizer = Normalizer()
        # Fit on a subset of training data for efficiency.
        sample_data = [train_dataset[i][0] for i in range(min(1000, len(train_dataset)))]
        # fit() already prints the computed stats, so don't print them again
        # here (the previous version duplicated that output line).
        normalizer.fit(sample_data)

    print("✅ Pipeline created:")
    print(f" - Training batches: {len(train_loader)}")
    print(f" - Test batches: {len(test_loader)}")
    print(f" - Batch size: {batch_size}")
    print(f" - Normalization: {normalize}")

    return train_loader, test_loader
|
||||
Reference in New Issue
Block a user