diff --git a/.gitignore b/.gitignore index 0fa523b8..4ab91713 100644 --- a/.gitignore +++ b/.gitignore @@ -147,12 +147,21 @@ training_*.json # Data (too large for git) data/ -datasets/ +# Downloaded datasets (large files, not committed) +datasets/* +# BUT allow tiny datasets directory (small files we ship) +!datasets/tiny/ +!datasets/README.md +!datasets/download_*.py *.csv -*.npz +# *.npz - Don't ignore .npz globally, some are tiny datasets *.npy *.pickle *.pkl +# Ignore large .npz files (but not in datasets/tiny/) +data/*.npz +datasets/mnist/*.npz +datasets/cifar10/*.npz # Temporary files tmp/ diff --git a/datasets/README.md b/datasets/README.md index c58b767a..1ca7e215 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -2,6 +2,31 @@ This directory contains datasets for TinyTorch examples and training. +## Directory Structure + +``` +datasets/ +├── tiny/ ← Tiny datasets shipped with repo (~100KB each) +│ └── digits_8x8.npz (1,797 samples, 67KB) +├── mnist/ ← Full MNIST (downloaded, gitignored) +├── cifar10/ ← Full CIFAR-10 (downloaded, gitignored) +└── download_*.py ← Download scripts for large datasets +``` + +## Quick Start + +**For learning (instant, offline):** +```python +# Use tiny shipped datasets +import numpy as np +data = np.load('datasets/tiny/digits_8x8.npz') +``` + +**For serious training (download once):** +```bash +python datasets/download_mnist.py +``` + ## MNIST Dataset The `mnist/` directory should contain the MNIST or Fashion-MNIST dataset files: diff --git a/datasets/tiny/README.md b/datasets/tiny/README.md new file mode 100644 index 00000000..71dde5ad --- /dev/null +++ b/datasets/tiny/README.md @@ -0,0 +1,133 @@ +# Tiny Datasets for TinyTorch + +**Small, curated datasets that ship with TinyTorch** - no downloads required! + +These datasets are committed to the repository for instant, offline-friendly learning. 
+ +--- + +## 📊 Available Datasets + +### 8×8 Handwritten Digits + +**File:** `digits_8x8.npz` +**Size:** ~67 KB +**Samples:** 1,797 images +**Shape:** (8, 8) grayscale +**Classes:** 10 digits (0-9) +**Source:** UCI ML Repository via sklearn + +**Perfect for:** +- Learning DataLoader mechanics +- Quick CNN testing +- Offline development +- Educational demos + +**Usage:** +```python +import numpy as np +from tinytorch import Tensor +from tinytorch.data.loader import TensorDataset, DataLoader + +# Load the dataset +data = np.load('datasets/tiny/digits_8x8.npz') +images = Tensor(data['images']) +labels = Tensor(data['labels']) + +# Create dataset and loader +dataset = TensorDataset(images, labels) +loader = DataLoader(dataset, batch_size=32, shuffle=True) + +# Iterate through batches +for batch_images, batch_labels in loader: + print(f"Batch: {batch_images.shape}, Labels: {batch_labels.shape}") +``` + +**Visual Sample:** +``` +Digit "5": Digit "3": Digit "8": +░░████░░ ░█████░ ░█████░░ +░█░░░█░ ░░░░░█░ █░░░░░█░ +░░░░█░░ ░░███░░ ░█████░░ +░░░█░░░ ░░░░░█░ █░░░░░█░ +░░█░░░░ ░░████░░ ░█████░░ +``` + +--- + +## 🎯 Philosophy + +**Why ship tiny datasets?** + +1. **Zero friction** - Students start learning immediately +2. **Offline-first** - Works in classrooms, planes, anywhere +3. **Fast iteration** - No wait times, instant feedback +4. 
**Educational focus** - Sized for learning, not production + +**Progression:** +- **Tiny datasets** (here) → Learn DataLoader mechanics +- **Downloaded datasets** (../mnist/, ../cifar10/) → Real applications +- **Custom datasets** → Production skills + +--- + +## 📂 File Format + +All datasets use NumPy's `.npz` format (compressed): + +```python +data = np.load('dataset.npz') +images = data['images'] # Shape: (N, H, W) or (N, H, W, C) +labels = data['labels'] # Shape: (N,) +``` + +**Benefits:** +- Fast loading +- Compressed storage +- Python-native +- Easy inspection + +--- + +## 🔧 Creating New Tiny Datasets + +See `create_digits_8x8.py` for an example extraction script. + +**Guidelines:** +- Max size: ~100 KB per dataset +- Format: `.npz` with `images` and `labels` keys +- Normalize: Images in [0, 1] range +- License: Verify public domain / open source + +--- + +## 📚 Dataset Information + +### Digits 8×8 Credits + +**Original Source:** +- E. Alpaydin, C. Kaynak (1998) +- UCI Machine Learning Repository +- "Optical Recognition of Handwritten Digits" + +**Preprocessing:** +- Extracted via `sklearn.datasets.load_digits()` +- Normalized from [0, 16] to [0, 1] +- Saved as float32 for efficiency + +**License:** Public domain + +--- + +## 🚀 Next Steps + +After mastering DataLoader with tiny datasets: + +1. **Module 08** → Build DataLoader with digits_8x8 +2. **Milestone 03** → Train MLP on full MNIST +3. **Milestone 04** → Train CNN on CIFAR-10 +4. **Custom datasets** → Apply to your own data + +Tiny datasets teach the mechanics. +Real datasets teach the systems. +Custom datasets teach the engineering. 
diff --git a/datasets/tiny/create_digits_8x8.py b/datasets/tiny/create_digits_8x8.py
new file mode 100644
index 00000000..d3d61979
--- /dev/null
+++ b/datasets/tiny/create_digits_8x8.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""
+Create 8x8 Digits Dataset
+=========================
+
+Extracts the 8×8 handwritten digits dataset from sklearn and saves it
+as a compact .npz file for TinyTorch.
+
+Source: UCI Machine Learning Repository
+Used by: sklearn.datasets.load_digits()
+Size: 1,797 samples, 8×8 grayscale images
+License: Public domain
+
+The output file is written next to this script, regardless of the
+directory the script is launched from.
+"""
+
+import sys
+from pathlib import Path
+
+import numpy as np
+
+# Write next to this script rather than into the current working directory,
+# so running from the repo root still produces datasets/tiny/digits_8x8.npz.
+OUTPUT_FILE = Path(__file__).resolve().parent / 'digits_8x8.npz'
+
+# sklearn's digits images store pixel intensities in the range 0-16.
+MAX_PIXEL_VALUE = 16.0
+
+
+def main():
+    """Load the sklearn digits, normalize them, and save a compressed .npz."""
+    # Imported inside main() so that importing this module has no side
+    # effects and a missing dependency produces a readable message.
+    try:
+        from sklearn.datasets import load_digits
+    except ImportError:
+        # sys.exit() always exists; the bare exit() helper is only injected
+        # by the `site` module and is absent under `python -S`.
+        print("❌ sklearn not installed. Install with: pip install scikit-learn",
+              file=sys.stderr)
+        sys.exit(1)
+
+    print("📥 Loading 8×8 digits from sklearn...")
+    digits = load_digits()
+
+    print(f"✅ Loaded {len(digits.images)} digit images")
+    print(f" Shape: {digits.images.shape}")
+    print(f" Classes: {np.unique(digits.target)}")
+
+    # Normalize to [0, 1] range (original is 0-16)
+    images_normalized = digits.images.astype(np.float32) / MAX_PIXEL_VALUE
+    labels = digits.target.astype(np.int64)
+
+    # Save as compressed .npz
+    np.savez_compressed(OUTPUT_FILE, images=images_normalized, labels=labels)
+
+    # Check file size
+    file_size_kb = OUTPUT_FILE.stat().st_size / 1024
+    print(f"\n💾 Saved to {OUTPUT_FILE}")
+    print(f" File size: {file_size_kb:.1f} KB")
+    print(f" Images shape: {images_normalized.shape}")
+    print(f" Labels shape: {labels.shape}")
+    print(f" Value range: [{images_normalized.min():.2f}, {images_normalized.max():.2f}]")
+
+    # Quick verification
+    # Derive the class count from the labels instead of hard-coding 10.
+    num_classes = len(np.unique(labels))
+    print("\n✅ Dataset ready for TinyTorch!")
+    print(f" Total samples: {len(images_normalized)}")
+    print(f" Samples per class: ~{len(images_normalized) // num_classes}")
+    print(" Perfect for DataLoader demos!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/datasets/tiny/digits_8x8.npz b/datasets/tiny/digits_8x8.npz
new file mode 100644
index 00000000..c2b475b8
Binary files /dev/null and b/datasets/tiny/digits_8x8.npz differ