Files
TinyTorch/dev/datasets.html
2025-11-20 05:18:34 +00:00

958 lines
44 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
<title>TinyTorch Datasets &#8212; Tiny🔥Torch</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="_static/pygments.css" />
<link rel="stylesheet" href="_static/styles/sphinx-book-theme.css?digest=14f4ca6b54d191a8c7657f6c759bf11a5fb86285" type="text/css" />
<link rel="stylesheet" type="text/css" href="_static/togglebutton.css" />
<link rel="stylesheet" type="text/css" href="_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css" />
<link rel="stylesheet" type="text/css" href="_static/sphinx-thebe.css" />
<link rel="stylesheet" type="text/css" href="_static/custom.css" />
<link rel="stylesheet" type="text/css" href="_static/design-style.4045f2051d55cab465a707391d5b2007.min.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
<script src="_static/jquery.js"></script>
<script src="_static/underscore.js"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js"></script>
<script src="_static/doctools.js"></script>
<script src="_static/clipboard.min.js"></script>
<script src="_static/copybutton.js"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="_static/togglebutton.js"></script>
<script src="_static/ml-timeline.js"></script>
<script src="_static/wip-banner.js"></script>
<script src="_static/marimo-badges.js"></script>
<script src="_static/sidebar-link.js"></script>
<script src="_static/hero-carousel.js"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="_static/design-tabs.js"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"
const thebe_selector = ".thebe,.cell"
const thebe_selector_input = "pre"
const thebe_selector_output = ".output, .cell_output"
</script>
<script async="async" src="_static/sphinx-thebe.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'datasets';</script>
<link rel="shortcut icon" href="_static/favicon.svg"/>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Community Ecosystem" href="community.html" />
<link rel="prev" title="Troubleshooting Guide" href="tito/troubleshooting.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search this book..."
aria-label="Search this book..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="intro.html">
<img src="_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
<script>document.write(`<img src="_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<ul class="nav bd-sidenav bd-sidenav__home-link">
<li class="toctree-l1">
<a class="reference internal" href="intro.html">
Getting Started
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="quickstart-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="student-workflow.html">Student Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage-paths/classroom-use.html">For Instructors</a></li>
<li class="toctree-l1"><a class="reference internal" href="instructor-guide.html">Instructor Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage-paths/ta-guide.html">TA Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage-paths/team-onboarding.html">Team Onboarding</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules/20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="credits.html">Credits &amp; Acknowledgments</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><label class="sidebar-toggle primary-toggle btn btn-sm" for="pst-primary-sidebar-checkbox" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</label></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-source-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Source repositories">
<i class="fab fa-github"></i>
</button>
<ul class="dropdown-menu">
<li><a href="https://github.com/mlsysbook/TinyTorch" target="_blank"
class="btn btn-sm btn-source-repository-button dropdown-item"
title="Source repository"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fab fa-github"></i>
</span>
<span class="btn__text-container">Repository</span>
</a>
</li>
<li><a href="https://github.com/mlsysbook/TinyTorch/edit/main/site/datasets.md" target="_blank"
class="btn btn-sm btn-source-edit-button dropdown-item"
title="Suggest edit"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-pencil-alt"></i>
</span>
<span class="btn__text-container">Suggest edit</span>
</a>
</li>
<li><a href="https://github.com/mlsysbook/TinyTorch/issues/new?title=Issue%20on%20page%20%2Fdatasets.html&amp;body=Your%20issue%20content%20here." target="_blank"
class="btn btn-sm btn-source-issues-button dropdown-item"
title="Open an issue"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-lightbulb"></i>
</span>
<span class="btn__text-container">Open issue</span>
</a>
</li>
</ul>
</div>
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="_sources/datasets.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<label class="sidebar-toggle secondary-toggle btn btn-sm" for="pst-secondary-sidebar-checkbox" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</label>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>TinyTorch Datasets</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#design-philosophy">Design Philosophy</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#shipped-datasets-included-with-tinytorch">Shipped Datasets (Included with TinyTorch)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tinydigits-handwritten-digit-recognition">TinyDigits - Handwritten Digit Recognition</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tinytalks-conversational-q-a-dataset">TinyTalks - Conversational Q&amp;A Dataset</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#downloaded-datasets-auto-downloaded-on-demand">Downloaded Datasets (Auto-Downloaded On-Demand)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mnist-handwritten-digit-classification">MNIST - Handwritten Digit Classification</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#cifar-10-natural-image-classification">CIFAR-10 - Natural Image Classification</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#dataset-selection-rationale">Dataset Selection Rationale</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#why-these-specific-datasets">Why These Specific Datasets?</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#accessing-datasets">Accessing Datasets</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#for-students">For Students</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#for-developers-researchers">For Developers/Researchers</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#dataset-sizes-summary">Dataset Sizes Summary</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-ship-with-repo-matters">Why Ship-with-Repo Matters</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#frequently-asked-questions">Frequently Asked Questions</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#related-documentation">Related Documentation</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="tinytorch-datasets">
<h1>TinyTorch Datasets<a class="headerlink" href="#tinytorch-datasets" title="Permalink to this heading">#</a></h1>
<div style="background: #f8f9fa; padding: 2rem; border-radius: 0.5rem; margin: 2rem 0; text-align: center;">
<h2 style="margin: 0 0 1rem 0; color: #495057;">Ship-with-Repo Datasets for Fast Learning</h2>
<p style="margin: 0; font-size: 1.1rem; color: #6c757d;">Small datasets for instant iteration + standard benchmarks for validation</p>
</div>
<p><strong>Purpose</strong>: Understand TinyTorch’s dataset strategy and where to find each dataset used in milestones.</p>
<section id="design-philosophy">
<h2>Design Philosophy<a class="headerlink" href="#design-philosophy" title="Permalink to this heading">#</a></h2>
<p>TinyTorch uses a two-tier dataset approach:</p>
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1.5rem; margin: 2rem 0;">
<div style="background: #e3f2fd; border: 1px solid #2196f3; padding: 1.5rem; border-radius: 0.5rem;">
<h3 style="margin: 0 0 1rem 0; color: #1976d2;">Shipped Datasets</h3>
<p style="margin: 0 0 1rem 0;"><strong>~350 KB total - Ships with repository</strong></p>
<ul style="margin: 0; font-size: 0.9rem;">
<li>Small enough to fit in Git (~1K samples each)</li>
<li>Fast training (seconds to minutes)</li>
<li>Instant gratification for learners</li>
<li>Works offline - no download needed</li>
<li>Perfect for rapid iteration</li>
</ul>
</div>
<div style="background: #f3e5f5; border: 1px solid #9c27b0; padding: 1.5rem; border-radius: 0.5rem;">
<h3 style="margin: 0 0 1rem 0; color: #7b1fa2;">Downloaded Datasets</h3>
<p style="margin: 0 0 1rem 0;"><strong>~180 MB - Auto-downloaded when needed</strong></p>
<ul style="margin: 0; font-size: 0.9rem;">
<li>Standard ML benchmarks (MNIST, CIFAR-10)</li>
<li>Larger scale (~60K samples)</li>
<li>Used for validation and scaling</li>
<li>Downloaded automatically by milestones</li>
<li>Cached locally for reuse</li>
</ul>
</div>
</div>
<p><strong>Philosophy</strong>: Following Andrej Karpathy’s “~1K samples” approach—small datasets for learning, full benchmarks for validation.</p>
</section>
<hr class="docutils" />
<section id="shipped-datasets-included-with-tinytorch">
<h2>Shipped Datasets (Included with TinyTorch)<a class="headerlink" href="#shipped-datasets-included-with-tinytorch" title="Permalink to this heading">#</a></h2>
<section id="tinydigits-handwritten-digit-recognition">
<h3>TinyDigits - Handwritten Digit Recognition<a class="headerlink" href="#tinydigits-handwritten-digit-recognition" title="Permalink to this heading">#</a></h3>
<div style="background: #fff5f5; border-left: 4px solid #e74c3c; padding: 1.5rem; margin: 1.5rem 0;">
<p><strong>Location</strong>: <code class="docutils literal notranslate"><span class="pre">datasets/tinydigits/</span></code><br />
<strong>Size</strong>: ~310 KB<br />
<strong>Used by</strong>: Milestones 03 &amp; 04 (MLP and CNN examples)</p>
<p><strong>Contents:</strong></p>
<ul class="simple">
<li><p>1,000 training samples</p></li>
<li><p>200 test samples</p></li>
<li><p>8×8 grayscale images (downsampled from MNIST)</p></li>
<li><p>10 classes (digits 0-9)</p></li>
</ul>
<p><strong>Format</strong>: Python pickle file with NumPy arrays</p>
<p><strong>Why 8×8?</strong></p>
<ul class="simple">
<li><p>Fast iteration: Trains in seconds</p></li>
<li><p>Memory-friendly: Small enough to debug</p></li>
<li><p>Conceptually complete: Same challenges as 28×28 MNIST</p></li>
<li><p>Git-friendly: Only 310 KB vs 10 MB for full MNIST</p></li>
</ul>
<p><strong>Usage in milestones:</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Automatically loaded by milestones</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">datasets.tinydigits</span><span class="w"> </span><span class="kn">import</span> <span class="n">load_tinydigits</span>
<span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">,</span> <span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span> <span class="o">=</span> <span class="n">load_tinydigits</span><span class="p">()</span>
<span class="c1"># X_train shape: (1000, 8, 8)</span>
<span class="c1"># y_train shape: (1000,)</span>
</pre></div>
</div>
</div>
</section>
<section id="tinytalks-conversational-q-a-dataset">
<h3>TinyTalks - Conversational Q&amp;A Dataset<a class="headerlink" href="#tinytalks-conversational-q-a-dataset" title="Permalink to this heading">#</a></h3>
<div style="background: #f0fff4; border-left: 4px solid #22c55e; padding: 1.5rem; margin: 1.5rem 0;">
<p><strong>Location</strong>: <code class="docutils literal notranslate"><span class="pre">datasets/tinytalks/</span></code><br />
<strong>Size</strong>: ~40 KB<br />
<strong>Used by</strong>: Milestone 05 (Transformer/GPT text generation)</p>
<p><strong>Contents:</strong></p>
<ul class="simple">
<li><p>350 Q&amp;A pairs across 5 difficulty levels</p></li>
<li><p>Character-level text data</p></li>
<li><p>Topics: General knowledge, math, science, reasoning</p></li>
<li><p>Balanced difficulty distribution</p></li>
</ul>
<p><strong>Format</strong>: Plain text files with Q: / A: format</p>
<p><strong>Why conversational format?</strong></p>
<ul class="simple">
<li><p>Engaging: Questions feel natural</p></li>
<li><p>Varied: Different answer lengths and complexity</p></li>
<li><p>Educational: Difficulty levels scaffold learning</p></li>
<li><p>Practical: Mirrors real chatbot use cases</p></li>
</ul>
<p><strong>Example:</strong></p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Q: What is the capital of France?
A: Paris
Q: If a train travels 120 km in 2 hours, what is its average speed?
A: 60 km/h
</pre></div>
</div>
<p><strong>Usage in milestones:</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Automatically loaded by transformer milestones</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">datasets.tinytalks</span><span class="w"> </span><span class="kn">import</span> <span class="n">load_tinytalks</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">load_tinytalks</span><span class="p">()</span>
<span class="c1"># Returns list of (question, answer) pairs</span>
</pre></div>
</div>
<p>See detailed documentation: <code class="docutils literal notranslate"><span class="pre">datasets/tinytalks/README.md</span></code></p>
</div>
</section>
</section>
<hr class="docutils" />
<section id="downloaded-datasets-auto-downloaded-on-demand">
<h2>Downloaded Datasets (Auto-Downloaded On-Demand)<a class="headerlink" href="#downloaded-datasets-auto-downloaded-on-demand" title="Permalink to this heading">#</a></h2>
<p>These standard benchmarks download automatically when you run relevant milestone scripts:</p>
<section id="mnist-handwritten-digit-classification">
<h3>MNIST - Handwritten Digit Classification<a class="headerlink" href="#mnist-handwritten-digit-classification" title="Permalink to this heading">#</a></h3>
<div style="background: #fffbeb; border-left: 4px solid #f59e0b; padding: 1.5rem; margin: 1.5rem 0;">
<p><strong>Downloads to</strong>: <code class="docutils literal notranslate"><span class="pre">milestones/datasets/mnist/</span></code><br />
<strong>Size</strong>: ~10 MB (compressed)<br />
<strong>Used by</strong>: <code class="docutils literal notranslate"><span class="pre">milestones/03_1986_mlp/02_rumelhart_mnist.py</span></code></p>
<p><strong>Contents:</strong></p>
<ul class="simple">
<li><p>60,000 training samples</p></li>
<li><p>10,000 test samples</p></li>
<li><p>28×28 grayscale images</p></li>
<li><p>10 classes (digits 0-9)</p></li>
</ul>
<p><strong>Auto-download</strong>: When you run the MNIST milestone script, it automatically:</p>
<ol class="arabic simple">
<li><p>Checks if data exists locally</p></li>
<li><p>Downloads if needed (~10 MB)</p></li>
<li><p>Caches for future runs</p></li>
<li><p>Loads data using your TinyTorch DataLoader</p></li>
</ol>
<p><strong>Purpose</strong>: Validate that your framework achieves production-level results (95%+ accuracy target)</p>
<p><strong>Milestone goal</strong>: Implement backpropagation and achieve 95%+ accuracy—matching Rumelhart’s 1986 breakthrough.</p>
</div>
</section>
<section id="cifar-10-natural-image-classification">
<h3>CIFAR-10 - Natural Image Classification<a class="headerlink" href="#cifar-10-natural-image-classification" title="Permalink to this heading">#</a></h3>
<div style="background: #fdf2f8; border-left: 4px solid #ec4899; padding: 1.5rem; margin: 1.5rem 0;">
<p><strong>Downloads to</strong>: <code class="docutils literal notranslate"><span class="pre">milestones/datasets/cifar-10/</span></code><br />
<strong>Size</strong>: ~170 MB (compressed)<br />
<strong>Used by</strong>: <code class="docutils literal notranslate"><span class="pre">milestones/04_1998_cnn/02_lecun_cifar10.py</span></code></p>
<p><strong>Contents:</strong></p>
<ul class="simple">
<li><p>50,000 training samples</p></li>
<li><p>10,000 test samples</p></li>
<li><p>32×32 RGB images</p></li>
<li><p>10 classes (airplane, car, bird, cat, deer, dog, frog, horse, ship, truck)</p></li>
</ul>
<p><strong>Auto-download</strong>: Milestone script handles everything:</p>
<ol class="arabic simple">
<li><p>Downloads from official source</p></li>
<li><p>Verifies integrity</p></li>
<li><p>Caches locally</p></li>
<li><p>Preprocesses for your framework</p></li>
</ol>
<p><strong>Purpose</strong>: Prove your CNN implementation works on real natural images (75%+ accuracy target)</p>
<p><strong>Milestone goal</strong>: Build LeNet-style CNN achieving 75%+ accuracy—demonstrating spatial intelligence.</p>
</div>
</section>
</section>
<hr class="docutils" />
<section id="dataset-selection-rationale">
<h2>Dataset Selection Rationale<a class="headerlink" href="#dataset-selection-rationale" title="Permalink to this heading">#</a></h2>
<section id="why-these-specific-datasets">
<h3>Why These Specific Datasets?<a class="headerlink" href="#why-these-specific-datasets" title="Permalink to this heading">#</a></h3>
<p><strong>TinyDigits (not full MNIST):</strong></p>
<ul class="simple">
<li><p>100× faster training iterations</p></li>
<li><p>Ships with repo (no download)</p></li>
<li><p>Same conceptual challenges</p></li>
<li><p>Perfect for learning and debugging</p></li>
</ul>
<p><strong>TinyTalks (custom dataset):</strong></p>
<ul class="simple">
<li><p>Designed for educational progression</p></li>
<li><p>Scaffolded difficulty levels</p></li>
<li><p>Character-level tokenization friendly</p></li>
<li><p>Engaging conversational format</p></li>
</ul>
<p><strong>MNIST (when scaling up):</strong></p>
<ul class="simple">
<li><p>Industry standard benchmark</p></li>
<li><p>Validates your implementation</p></li>
<li><p>Comparable to published results</p></li>
<li><p>95%+ accuracy is an achievable milestone</p></li>
</ul>
<p><strong>CIFAR-10 (for CNN validation):</strong></p>
<ul class="simple">
<li><p>Natural images (harder than digits)</p></li>
<li><p>RGB channels (multi-dimensional)</p></li>
<li><p>Standard CNN benchmark</p></li>
<li><p>75%+ with basic CNN proves it works</p></li>
</ul>
</section>
</section>
<hr class="docutils" />
<section id="accessing-datasets">
<h2>Accessing Datasets<a class="headerlink" href="#accessing-datasets" title="Permalink to this heading">#</a></h2>
<section id="for-students">
<h3>For Students<a class="headerlink" href="#for-students" title="Permalink to this heading">#</a></h3>
<p><strong>You don’t need to manually download anything!</strong></p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Just run milestone scripts</span>
<span class="nb">cd</span><span class="w"> </span>milestones/03_1986_mlp
python<span class="w"> </span>01_rumelhart_tinydigits.py<span class="w"> </span><span class="c1"># Uses shipped TinyDigits</span>
python<span class="w"> </span>02_rumelhart_mnist.py<span class="w"> </span><span class="c1"># Auto-downloads MNIST if needed</span>
</pre></div>
</div>
<p>The milestones handle all data loading automatically.</p>
</section>
<section id="for-developers-researchers">
<h3>For Developers/Researchers<a class="headerlink" href="#for-developers-researchers" title="Permalink to this heading">#</a></h3>
<p><strong>Direct dataset access:</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Shipped datasets (always available)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">datasets.tinydigits</span><span class="w"> </span><span class="kn">import</span> <span class="n">load_tinydigits</span>
<span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">,</span> <span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span> <span class="o">=</span> <span class="n">load_tinydigits</span><span class="p">()</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">datasets.tinytalks</span><span class="w"> </span><span class="kn">import</span> <span class="n">load_tinytalks</span>
<span class="n">conversations</span> <span class="o">=</span> <span class="n">load_tinytalks</span><span class="p">()</span>
<span class="c1"># Downloaded datasets (through milestones)</span>
<span class="c1"># See milestones/data_manager.py for download utilities</span>
</pre></div>
</div>
</section>
</section>
<hr class="docutils" />
<section id="dataset-sizes-summary">
<h2>Dataset Sizes Summary<a class="headerlink" href="#dataset-sizes-summary" title="Permalink to this heading">#</a></h2>
<div class="pst-scrollable-table-container"><table class="table">
<thead>
<tr class="row-odd"><th class="head"><p>Dataset</p></th>
<th class="head"><p>Size</p></th>
<th class="head"><p>Samples</p></th>
<th class="head"><p>Ships With Repo</p></th>
<th class="head"><p>Purpose</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>TinyDigits</p></td>
<td><p>310 KB</p></td>
<td><p>1,200</p></td>
<td><p>Yes</p></td>
<td><p>Fast MLP/CNN iteration</p></td>
</tr>
<tr class="row-odd"><td><p>TinyTalks</p></td>
<td><p>40 KB</p></td>
<td><p>350 pairs</p></td>
<td><p>Yes</p></td>
<td><p>Transformer learning</p></td>
</tr>
<tr class="row-even"><td><p>MNIST</p></td>
<td><p>10 MB</p></td>
<td><p>70,000</p></td>
<td><p>Downloads</p></td>
<td><p>MLP validation</p></td>
</tr>
<tr class="row-odd"><td><p>CIFAR-10</p></td>
<td><p>170 MB</p></td>
<td><p>60,000</p></td>
<td><p>Downloads</p></td>
<td><p>CNN validation</p></td>
</tr>
</tbody>
</table>
</div>
<p><strong>Total shipped</strong>: ~350 KB<br />
<strong>Total with benchmarks</strong>: ~180 MB</p>
</section>
<hr class="docutils" />
<section id="why-ship-with-repo-matters">
<h2>Why Ship-with-Repo Matters<a class="headerlink" href="#why-ship-with-repo-matters" title="Permalink to this heading">#</a></h2>
<div style="background: #e3f2fd; padding: 1.5rem; border-radius: 0.5rem; margin: 1.5rem 0;">
<p><strong>Traditional ML courses:</strong></p>
<ul class="simple">
<li><p>“Download MNIST (10 MB)”</p></li>
<li><p>“Download CIFAR-10 (170 MB)”</p></li>
<li><p>Wait for downloads before starting</p></li>
<li><p>Large files in Git (bad practice)</p></li>
</ul>
<p><strong>TinyTorch approach:</strong></p>
<ul class="simple">
<li><p>Clone repo → Immediately start learning</p></li>
<li><p>Train first model in under 1 minute</p></li>
<li><p>Full benchmarks download only when scaling</p></li>
<li><p>Git repo stays small and fast</p></li>
</ul>
<p><strong>Educational benefit</strong>: Students see working models within minutes, not hours.</p>
</div>
</section>
<hr class="docutils" />
<section id="frequently-asked-questions">
<h2>Frequently Asked Questions<a class="headerlink" href="#frequently-asked-questions" title="Permalink to this heading">#</a></h2>
<p><strong>Q: Why not use full MNIST from the start?</strong><br />
A: TinyDigits trains 100× faster, enabling rapid iteration during learning. MNIST validates your complete implementation later.</p>
<p><strong>Q: Can I use my own datasets?</strong><br />
A: Absolutely! TinyTorch is a real framework—add your data loading code just like PyTorch.</p>
<p><strong>Q: Why ship datasets in Git?</strong><br />
A: 350 KB is negligible (smaller than many images), and it enables offline learning with instant iteration.</p>
<p><strong>Q: Where does CIFAR-10 download from?</strong><br />
A: Official sources via <code class="docutils literal notranslate"><span class="pre">milestones/data_manager.py</span></code>, with integrity verification.</p>
<p><strong>Q: Can I skip the large downloads?</strong><br />
A: Yes! You can work through most milestones using only shipped datasets. Downloaded datasets are for validation milestones.</p>
</section>
<hr class="docutils" />
<section id="related-documentation">
<h2>Related Documentation<a class="headerlink" href="#related-documentation" title="Permalink to this heading">#</a></h2>
<ul class="simple">
<li><p><a class="reference internal" href="chapters/milestones.html"><span class="doc std std-doc">Milestones Guide</span></a> - See how each dataset is used in historical achievements</p></li>
<li><p><a class="reference internal" href="student-workflow.html"><span class="doc std std-doc">Student Workflow</span></a> - Learn the development cycle</p></li>
<li><p><a class="reference internal" href="quickstart-guide.html"><span class="doc std std-doc">Quick Start</span></a> - Start building in 15 minutes</p></li>
</ul>
<p><strong>Dataset implementation details</strong>: See <code class="docutils literal notranslate"><span class="pre">datasets/tinydigits/README.md</span></code> and <code class="docutils literal notranslate"><span class="pre">datasets/tinytalks/README.md</span></code> for technical specifications.</p>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./."
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="tito/troubleshooting.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Troubleshooting Guide</p>
</div>
</a>
<a class="right-next"
href="community.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Community Ecosystem</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#design-philosophy">Design Philosophy</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#shipped-datasets-included-with-tinytorch">Shipped Datasets (Included with TinyTorch)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tinydigits-handwritten-digit-recognition">TinyDigits - Handwritten Digit Recognition</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tinytalks-conversational-q-a-dataset">TinyTalks - Conversational Q&amp;A Dataset</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#downloaded-datasets-auto-downloaded-on-demand">Downloaded Datasets (Auto-Downloaded On-Demand)</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mnist-handwritten-digit-classification">MNIST - Handwritten Digit Classification</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#cifar-10-natural-image-classification">CIFAR-10 - Natural Image Classification</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#dataset-selection-rationale">Dataset Selection Rationale</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#why-these-specific-datasets">Why These Specific Datasets?</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#accessing-datasets">Accessing Datasets</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#for-students">For Students</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#for-developers-researchers">For Developers/Researchers</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#dataset-sizes-summary">Dataset Sizes Summary</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-ship-with-repo-matters">Why Ship-with-Repo Matters</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#frequently-asked-questions">Frequently Asked Questions</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#related-documentation">Related Documentation</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2025.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>