mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-05-24 03:46:18 -05:00
949 lines
60 KiB
HTML
949 lines
60 KiB
HTML
|
||
<!DOCTYPE html>
|
||
|
||
|
||
<html lang="en" data-content_root="../" >
|
||
|
||
<head>
|
||
<meta charset="utf-8" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
|
||
<title>08. DataLoader — Tiny🔥Torch</title>
|
||
|
||
|
||
|
||
<script data-cfasync="false">
|
||
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
|
||
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
|
||
</script>
|
||
|
||
<!-- Loaded before other Sphinx assets -->
|
||
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
|
||
|
||
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
|
||
|
||
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/custom.css?v=009d37f4" />
|
||
|
||
<!-- Pre-loaded scripts that we'll load fully later -->
|
||
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
|
||
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
|
||
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
|
||
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
|
||
<script src="../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
|
||
<script src="../_static/copybutton.js?v=f281be69"></script>
|
||
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
|
||
<script>let toggleHintShow = 'Click to show';</script>
|
||
<script>let toggleHintHide = 'Click to hide';</script>
|
||
<script>let toggleOpenOnPrint = 'true';</script>
|
||
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
|
||
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
|
||
<script src="../_static/design-tabs.js?v=f930bc37"></script>
|
||
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
|
||
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
|
||
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
|
||
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
|
||
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/08_dataloader_ABOUT';</script>
|
||
<script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
|
||
<script src="../_static/wip-banner.js?v=04a7e74d"></script>
|
||
<script src="../_static/marimo-badges.js?v=e6289128"></script>
|
||
<script src="../_static/sidebar-link.js?v=404b701b"></script>
|
||
<script src="../_static/hero-carousel.js?v=10341d2a"></script>
|
||
<script src="../_static/subscribe-modal.js?v=42919b64"></script>
|
||
<link rel="icon" href="../_static/favicon.svg"/>
|
||
<link rel="index" title="Index" href="../genindex.html" />
|
||
<link rel="search" title="Search" href="../search.html" />
|
||
<link rel="next" title="09. Spatial Operations" href="09_spatial_ABOUT.html" />
|
||
<link rel="prev" title="🏛️ Architecture Tier (Modules 08-13)" href="../tiers/architecture.html" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||
<meta name="docsearch:language" content="en"/>
|
||
</head>
|
||
|
||
|
||
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
|
||
|
||
|
||
|
||
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
|
||
|
||
<div id="pst-scroll-pixel-helper"></div>
|
||
|
||
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
|
||
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
|
||
|
||
|
||
<input type="checkbox"
|
||
class="sidebar-toggle"
|
||
id="pst-primary-sidebar-checkbox"/>
|
||
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
|
||
|
||
<input type="checkbox"
|
||
class="sidebar-toggle"
|
||
id="pst-secondary-sidebar-checkbox"/>
|
||
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
|
||
|
||
<div class="search-button__wrapper">
|
||
<div class="search-button__overlay"></div>
|
||
<div class="search-button__search-container">
|
||
<form class="bd-search d-flex align-items-center"
|
||
action="../search.html"
|
||
method="get">
|
||
<i class="fa-solid fa-magnifying-glass"></i>
|
||
<input type="search"
|
||
class="form-control"
|
||
name="q"
|
||
id="search-input"
|
||
placeholder="Search..."
|
||
aria-label="Search..."
|
||
autocomplete="off"
|
||
autocorrect="off"
|
||
autocapitalize="off"
|
||
spellcheck="false"/>
|
||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
|
||
</form></div>
|
||
</div>
|
||
|
||
<div class="pst-async-banner-revealer d-none">
|
||
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
|
||
</div>
|
||
|
||
|
||
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
|
||
</header>
|
||
|
||
|
||
<div class="bd-container">
|
||
<div class="bd-container__inner bd-page-width">
|
||
|
||
|
||
|
||
<div class="bd-sidebar-primary bd-sidebar">
|
||
|
||
|
||
|
||
<div class="sidebar-header-items sidebar-primary__section">
|
||
|
||
|
||
|
||
|
||
</div>
|
||
|
||
<div class="sidebar-primary-items__start sidebar-primary__section">
|
||
<div class="sidebar-primary-item">
|
||
|
||
|
||
|
||
|
||
|
||
<a class="navbar-brand logo" href="../intro.html">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
|
||
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
|
||
|
||
|
||
</a></div>
|
||
<div class="sidebar-primary-item">
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="fa-solid fa-magnifying-glass"></i>
|
||
<span class="search-button__default-text">Search</span>
|
||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
|
||
</button>
|
||
`);
|
||
</script></div>
|
||
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
|
||
<div class="bd-toc-item navbar-nav active">
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
|
||
<ul class="current nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
|
||
<li class="toctree-l1 current active"><a class="current reference internal" href="#">08. DataLoader</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites & Resources</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress & Data</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits & Acknowledgments</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</nav></div>
|
||
</div>
|
||
|
||
|
||
<div class="sidebar-primary-items__end sidebar-primary__section">
|
||
</div>
|
||
|
||
<div id="rtd-footer-container"></div>
|
||
|
||
|
||
</div>
|
||
|
||
<main id="main-content" class="bd-main" role="main">
|
||
|
||
|
||
|
||
<div class="sbt-scroll-pixel-helper"></div>
|
||
|
||
<div class="bd-content">
|
||
<div class="bd-article-container">
|
||
|
||
<div class="bd-header-article d-print-none">
|
||
<div class="header-article-items header-article__inner">
|
||
|
||
<div class="header-article-items__start">
|
||
|
||
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<span class="fa-solid fa-bars"></span>
|
||
</button></div>
|
||
|
||
</div>
|
||
|
||
|
||
<div class="header-article-items__end">
|
||
|
||
<div class="header-article-item">
|
||
|
||
<div class="article-header-buttons">
|
||
|
||
|
||
|
||
|
||
|
||
<div class="dropdown dropdown-download-buttons">
|
||
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
|
||
<i class="fas fa-download"></i>
|
||
</button>
|
||
<ul class="dropdown-menu">
|
||
|
||
|
||
|
||
<li><a href="../_sources/modules/08_dataloader_ABOUT.md" target="_blank"
|
||
class="btn btn-sm btn-download-source-button dropdown-item"
|
||
title="Download source file"
|
||
data-bs-placement="left" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-file"></i>
|
||
</span>
|
||
<span class="btn__text-container">.md</span>
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
<li>
|
||
<button onclick="window.print()"
|
||
class="btn btn-sm btn-download-pdf-button dropdown-item"
|
||
title="Print to PDF"
|
||
data-bs-placement="left" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-file-pdf"></i>
|
||
</span>
|
||
<span class="btn__text-container">.pdf</span>
|
||
</button>
|
||
</li>
|
||
|
||
</ul>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
<button onclick="toggleFullScreen()"
|
||
class="btn btn-sm btn-fullscreen-button"
|
||
title="Fullscreen mode"
|
||
data-bs-placement="bottom" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-expand"></i>
|
||
</span>
|
||
|
||
</button>
|
||
|
||
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
|
||
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
|
||
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
|
||
</button>
|
||
`);
|
||
</script>
|
||
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
|
||
</button>
|
||
`);
|
||
</script>
|
||
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<span class="fa-solid fa-list"></span>
|
||
</button>
|
||
</div></div>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div id="jb-print-docs-body" class="onlyprint">
|
||
<h1>08. DataLoader</h1>
|
||
<!-- Table of contents -->
|
||
<div id="print-main-content">
|
||
<div id="jb-print-toc">
|
||
|
||
<div>
|
||
<h2> Contents </h2>
|
||
</div>
|
||
<nav aria-label="Page">
|
||
<ul class="visible nav section-nav flex-column">
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#dataset-abstraction">Dataset Abstraction</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensordataset-implementation">TensorDataset Implementation</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#dataloader-with-batching-and-shuffling">DataLoader with Batching and Shuffling</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing & Validation</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#data-pipeline-theory">Data Pipeline Theory</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div id="searchbox"></div>
|
||
<article class="bd-article">
|
||
|
||
<section id="dataloader">
|
||
<h1>08. DataLoader<a class="headerlink" href="#dataloader" title="Link to this heading">#</a></h1>
|
||
<p><strong>ARCHITECTURE TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 4-5 hours</p>
|
||
<section id="overview">
|
||
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
|
||
<p>This module implements the data loading infrastructure that powers neural network training at scale. You’ll build the Dataset/DataLoader abstraction pattern used by PyTorch, TensorFlow, and every major ML framework—implementing batching, shuffling, and memory-efficient iteration from first principles. This is where data engineering meets systems thinking.</p>
|
||
</section>
|
||
<section id="learning-objectives">
|
||
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
|
||
<p>By the end of this module, you will be able to:</p>
|
||
<ul class="simple">
|
||
<li><p><strong>Design Dataset Abstractions</strong>: Implement the protocol-based interface (<code class="docutils literal notranslate"><span class="pre">__getitem__</span></code>, <code class="docutils literal notranslate"><span class="pre">__len__</span></code>) that separates data storage from data access</p></li>
|
||
<li><p><strong>Build Efficient DataLoaders</strong>: Create batching and shuffling mechanisms that stream data without loading entire datasets into memory</p></li>
|
||
<li><p><strong>Master Iterator Patterns</strong>: Understand how Python’s <code class="docutils literal notranslate"><span class="pre">for</span></code> loops work under the hood and implement custom iterators</p></li>
|
||
<li><p><strong>Optimize Data Pipelines</strong>: Analyze throughput bottlenecks and balance batch size against memory constraints</p></li>
|
||
<li><p><strong>Apply to Real Datasets</strong>: Use your DataLoader with actual image datasets like MNIST and CIFAR-10 in milestone projects</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="build-use-optimize">
|
||
<h2>Build → Use → Optimize<a class="headerlink" href="#build-use-optimize" title="Link to this heading">#</a></h2>
|
||
<p>This module follows TinyTorch’s <strong>Build → Use → Optimize</strong> framework:</p>
|
||
<ol class="arabic simple">
|
||
<li><p><strong>Build</strong>: Implement Dataset abstraction, TensorDataset for in-memory data, and DataLoader with batching/shuffling</p></li>
|
||
<li><p><strong>Use</strong>: Load synthetic datasets, create train/validation splits, and integrate with training loops</p></li>
|
||
<li><p><strong>Optimize</strong>: Profile throughput, analyze memory scaling, and measure shuffle overhead</p></li>
|
||
</ol>
|
||
</section>
|
||
<section id="implementation-guide">
|
||
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
|
||
<section id="dataset-abstraction">
|
||
<h3>Dataset Abstraction<a class="headerlink" href="#dataset-abstraction" title="Link to this heading">#</a></h3>
|
||
<p>The foundation of all data loading—a protocol-based interface for accessing samples:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">abc</span><span class="w"> </span><span class="kn">import</span> <span class="n">ABC</span><span class="p">,</span> <span class="n">abstractmethod</span>
|
||
|
||
<span class="k">class</span><span class="w"> </span><span class="nc">Dataset</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Abstract base class defining the dataset interface.</span>
|
||
|
||
<span class="sd"> All datasets must implement:</span>
|
||
<span class="sd"> - __len__(): Return total number of samples</span>
|
||
<span class="sd"> - __getitem__(idx): Return sample at given index</span>
|
||
|
||
<span class="sd"> This enables Pythonic usage:</span>
|
||
<span class="sd"> len(dataset) # How many samples?</span>
|
||
<span class="sd"> dataset[42] # Get sample 42</span>
|
||
<span class="sd"> for x in dataset # Iterate over all samples</span>
|
||
<span class="sd"> """</span>
|
||
|
||
<span class="nd">@abstractmethod</span>
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""Return total number of samples in dataset."""</span>
|
||
<span class="k">pass</span>
|
||
|
||
<span class="nd">@abstractmethod</span>
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">idx</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Return sample at given index."""</span>
|
||
<span class="k">pass</span>
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>Why This Design:</strong></p>
|
||
<ul class="simple">
|
||
<li><p><strong>Protocol-based</strong>: Uses Python’s <code class="docutils literal notranslate"><span class="pre">__len__</span></code> and <code class="docutils literal notranslate"><span class="pre">__getitem__</span></code> for natural syntax</p></li>
|
||
<li><p><strong>Framework-agnostic</strong>: Same pattern used by PyTorch, TensorFlow, JAX</p></li>
|
||
<li><p><strong>Separation of concerns</strong>: Decouples <em>what data exists</em> from <em>how to load it</em></p></li>
|
||
<li><p><strong>Enables optimization</strong>: Makes caching, prefetching, and parallel loading possible</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="tensordataset-implementation">
|
||
<h3>TensorDataset Implementation<a class="headerlink" href="#tensordataset-implementation" title="Link to this heading">#</a></h3>
|
||
<p>When your data fits in memory, TensorDataset provides efficient access:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">TensorDataset</span><span class="p">(</span><span class="n">Dataset</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Dataset for in-memory tensors.</span>
|
||
|
||
<span class="sd"> Wraps multiple tensors with aligned first dimension:</span>
|
||
<span class="sd"> features: (N, feature_dim)</span>
|
||
<span class="sd"> labels: (N,)</span>
|
||
|
||
<span class="sd"> Returns tuple of tensors for each sample:</span>
|
||
<span class="sd"> dataset[i] → (features[i], labels[i])</span>
|
||
<span class="sd"> """</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">tensors</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Store tensors, validate first dimension alignment."""</span>
|
||
<span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">tensors</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span>
|
||
<span class="n">first_size</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">tensors</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>
|
||
<span class="k">for</span> <span class="n">tensor</span> <span class="ow">in</span> <span class="n">tensors</span><span class="p">:</span>
|
||
<span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span><span class="p">)</span> <span class="o">==</span> <span class="n">first_size</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">tensors</span> <span class="o">=</span> <span class="n">tensors</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span>
|
||
<span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">tensors</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">idx</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
|
||
<span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">Tensor</span><span class="p">(</span><span class="n">t</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">idx</span><span class="p">])</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">tensors</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>Key Features:</strong></p>
|
||
<ul class="simple">
|
||
<li><p><strong>Memory locality</strong>: All data pre-loaded for fast access</p></li>
|
||
<li><p><strong>Vectorized operations</strong>: No conversion overhead during training</p></li>
|
||
<li><p><strong>Flexible</strong>: Handles any number of aligned tensors (features, labels, metadata)</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="dataloader-with-batching-and-shuffling">
|
||
<h3>DataLoader with Batching and Shuffling<a class="headerlink" href="#dataloader-with-batching-and-shuffling" title="Link to this heading">#</a></h3>
|
||
<p>The core engine that transforms samples into training-ready batches:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">DataLoader</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Efficient batch loader with shuffling support.</span>
|
||
|
||
<span class="sd"> Transforms:</span>
|
||
<span class="sd"> Individual samples → Batched tensors</span>
|
||
|
||
<span class="sd"> Features:</span>
|
||
<span class="sd"> - Automatic batching with configurable batch_size</span>
|
||
<span class="sd"> - Optional shuffling for training randomization</span>
|
||
<span class="sd"> - Memory-efficient iteration (one batch at a time)</span>
|
||
<span class="sd"> - Handles uneven final batch automatically</span>
|
||
<span class="sd"> """</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">:</span> <span class="n">Dataset</span><span class="p">,</span> <span class="n">batch_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">shuffle</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">dataset</span> <span class="o">=</span> <span class="n">dataset</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">batch_size</span> <span class="o">=</span> <span class="n">batch_size</span>
|
||
<span class="bp">self</span><span class="o">.</span><span class="n">shuffle</span> <span class="o">=</span> <span class="n">shuffle</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""Return number of batches per epoch."""</span>
|
||
<span class="k">return</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dataset</span><span class="p">)</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">batch_size</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">//</span> <span class="bp">self</span><span class="o">.</span><span class="n">batch_size</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="fm">__iter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Yield batches of data.</span>
|
||
|
||
<span class="sd"> Algorithm:</span>
|
||
<span class="sd"> 1. Generate indices [0, 1, ..., N-1]</span>
|
||
<span class="sd"> 2. Shuffle indices if requested</span>
|
||
<span class="sd"> 3. Group into chunks of batch_size</span>
|
||
<span class="sd"> 4. Load samples and collate into batch tensors</span>
|
||
<span class="sd"> 5. Yield each batch</span>
|
||
<span class="sd"> """</span>
|
||
<span class="n">indices</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dataset</span><span class="p">)))</span>
|
||
|
||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">shuffle</span><span class="p">:</span>
|
||
<span class="n">random</span><span class="o">.</span><span class="n">shuffle</span><span class="p">(</span><span class="n">indices</span><span class="p">)</span>
|
||
|
||
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">indices</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">batch_size</span><span class="p">):</span>
|
||
<span class="n">batch_indices</span> <span class="o">=</span> <span class="n">indices</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">batch_size</span><span class="p">]</span>
|
||
<span class="n">batch</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">dataset</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="k">for</span> <span class="n">idx</span> <span class="ow">in</span> <span class="n">batch_indices</span><span class="p">]</span>
|
||
<span class="k">yield</span> <span class="bp">self</span><span class="o">.</span><span class="n">_collate_batch</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span>
|
||
|
||
<span class="k">def</span><span class="w"> </span><span class="nf">_collate_batch</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">batch</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""Stack individual samples into batch tensors."""</span>
|
||
<span class="n">num_tensors</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">batch</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
|
||
<span class="n">batched_tensors</span> <span class="o">=</span> <span class="p">[]</span>
|
||
|
||
<span class="k">for</span> <span class="n">tensor_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_tensors</span><span class="p">):</span>
|
||
<span class="n">tensor_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">sample</span><span class="p">[</span><span class="n">tensor_idx</span><span class="p">]</span><span class="o">.</span><span class="n">data</span> <span class="k">for</span> <span class="n">sample</span> <span class="ow">in</span> <span class="n">batch</span><span class="p">]</span>
|
||
<span class="n">batched_data</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">stack</span><span class="p">(</span><span class="n">tensor_list</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
|
||
<span class="n">batched_tensors</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">Tensor</span><span class="p">(</span><span class="n">batched_data</span><span class="p">))</span>
|
||
|
||
<span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">batched_tensors</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>The Batching Transformation:</strong></p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Individual Samples (from Dataset):
|
||
dataset[0] → (features: [1, 2, 3], label: 0)
|
||
dataset[1] → (features: [4, 5, 6], label: 1)
|
||
dataset[2] → (features: [7, 8, 9], label: 0)
|
||
|
||
DataLoader Batching (batch_size=2):
|
||
Batch 1:
|
||
features: [[1, 2, 3], ← Shape: (2, 3)
|
||
[4, 5, 6]]
|
||
labels: [0, 1] ← Shape: (2,)
|
||
|
||
Batch 2:
|
||
features: [[7, 8, 9]] ← Shape: (1, 3) [last batch]
|
||
labels: [0] ← Shape: (1,)
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="getting-started">
|
||
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
|
||
<section id="prerequisites">
|
||
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
|
||
<p>Ensure you understand the foundations:</p>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
|
||
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch
|
||
|
||
<span class="c1"># Verify prerequisite modules</span>
|
||
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>tensor
|
||
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>layers
|
||
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>training
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>Required Knowledge:</strong></p>
|
||
<ul class="simple">
|
||
<li><p>Tensor operations and NumPy arrays (Module 01)</p></li>
|
||
<li><p>Neural network basics (Modules 03-04)</p></li>
|
||
<li><p>Training loop structure (Module 07)</p></li>
|
||
<li><p>Python protocols (<code class="docutils literal notranslate"><span class="pre">__getitem__</span></code>, <code class="docutils literal notranslate"><span class="pre">__len__</span></code>, <code class="docutils literal notranslate"><span class="pre">__iter__</span></code>)</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="development-workflow">
|
||
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
|
||
<ol class="arabic simple">
|
||
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/08_dataloader/dataloader.py</span></code></p></li>
|
||
<li><p><strong>Implement Dataset abstraction</strong>: Define abstract base class with <code class="docutils literal notranslate"><span class="pre">__len__</span></code> and <code class="docutils literal notranslate"><span class="pre">__getitem__</span></code></p></li>
|
||
<li><p><strong>Build TensorDataset</strong>: Create concrete implementation for tensor-based data</p></li>
|
||
<li><p><strong>Create DataLoader</strong>: Implement batching, shuffling, and iterator protocol</p></li>
|
||
<li><p><strong>Test integration</strong>: Verify with training workflow simulation</p></li>
|
||
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">08</span> <span class="pre">&&</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">dataloader</span></code></p></li>
|
||
</ol>
|
||
</section>
|
||
</section>
|
||
<section id="testing">
|
||
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
|
||
<section id="comprehensive-test-suite">
|
||
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
|
||
<p>Run the full test suite to verify DataLoader functionality:</p>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
|
||
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>dataloader
|
||
|
||
<span class="c1"># Direct pytest execution</span>
|
||
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>dataloader<span class="w"> </span>-v
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="test-coverage-areas">
|
||
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
|
||
<ul class="simple">
|
||
<li><p>✅ <strong>Dataset Interface</strong>: Abstract base class enforcement, protocol implementation</p></li>
|
||
<li><p>✅ <strong>TensorDataset</strong>: Tensor alignment validation, indexing correctness</p></li>
|
||
<li><p>✅ <strong>DataLoader Batching</strong>: Batch shape consistency, handling uneven final batch</p></li>
|
||
<li><p>✅ <strong>Shuffling</strong>: Randomization correctness, deterministic seeding</p></li>
|
||
<li><p>✅ <strong>Training Integration</strong>: Complete workflow with train/validation splits</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="inline-testing-validation">
|
||
<h3>Inline Testing & Validation<a class="headerlink" href="#inline-testing-validation" title="Link to this heading">#</a></h3>
|
||
<p>The module includes comprehensive unit tests:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Run inline tests during development</span>
|
||
<span class="n">python</span> <span class="n">modules</span><span class="o">/</span><span class="mi">08</span><span class="n">_dataloader</span><span class="o">/</span><span class="n">dataloader</span><span class="o">.</span><span class="n">py</span>
|
||
|
||
<span class="c1"># Expected output:</span>
|
||
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Dataset</span> <span class="n">Abstract</span> <span class="n">Base</span> <span class="n">Class</span><span class="o">...</span>
|
||
<span class="err">✅</span> <span class="n">Dataset</span> <span class="ow">is</span> <span class="n">properly</span> <span class="n">abstract</span>
|
||
<span class="err">✅</span> <span class="n">Dataset</span> <span class="n">interface</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
|
||
|
||
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">TensorDataset</span><span class="o">...</span>
|
||
<span class="err">✅</span> <span class="n">TensorDataset</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
|
||
|
||
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">DataLoader</span><span class="o">...</span>
|
||
<span class="err">✅</span> <span class="n">DataLoader</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
|
||
|
||
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">DataLoader</span> <span class="n">Deterministic</span> <span class="n">Shuffling</span><span class="o">...</span>
|
||
<span class="err">✅</span> <span class="n">Deterministic</span> <span class="n">shuffling</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
|
||
|
||
<span class="err">🔬</span> <span class="n">Integration</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Training</span> <span class="n">Workflow</span><span class="o">...</span>
|
||
<span class="err">✅</span> <span class="n">Training</span> <span class="n">integration</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="manual-testing-examples">
|
||
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.tensor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.dataloader</span><span class="w"> </span><span class="kn">import</span> <span class="n">TensorDataset</span><span class="p">,</span> <span class="n">DataLoader</span>
|
||
|
||
<span class="c1"># Create synthetic dataset</span>
|
||
<span class="n">features</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">]])</span>
|
||
<span class="n">labels</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">])</span>
|
||
<span class="n">dataset</span> <span class="o">=</span> <span class="n">TensorDataset</span><span class="p">(</span><span class="n">features</span><span class="p">,</span> <span class="n">labels</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Create DataLoader with batching</span>
|
||
<span class="n">loader</span> <span class="o">=</span> <span class="n">DataLoader</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="n">batch_size</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">shuffle</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Iterate through batches</span>
|
||
<span class="k">for</span> <span class="n">batch_features</span><span class="p">,</span> <span class="n">batch_labels</span> <span class="ow">in</span> <span class="n">loader</span><span class="p">:</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Batch features shape: </span><span class="si">{</span><span class="n">batch_features</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Batch labels shape: </span><span class="si">{</span><span class="n">batch_labels</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<span class="c1"># Output: (2, 2) and (2,)</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="systems-thinking-questions">
|
||
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
|
||
<section id="real-world-applications">
|
||
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Link to this heading">#</a></h3>
|
||
<ul class="simple">
|
||
<li><p><strong>Image Classification</strong>: How would you design a DataLoader for ImageNet (1.2M images, 150GB)? What if the dataset doesn’t fit in RAM?</p></li>
|
||
<li><p><strong>Language Modeling</strong>: LLM training streams billions of tokens—how does batch size affect memory and throughput for variable-length sequences?</p></li>
|
||
<li><p><strong>Autonomous Vehicles</strong>: Tesla trains on terabytes of sensor data—how would you handle multi-modal data (camera + LIDAR + GPS) in a DataLoader?</p></li>
|
||
<li><p><strong>Medical Imaging</strong>: 3D CT scans are too large for GPU memory—what batching strategy would you use for patch extraction?</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="performance-characteristics">
|
||
<h3>Performance Characteristics<a class="headerlink" href="#performance-characteristics" title="Link to this heading">#</a></h3>
|
||
<ul class="simple">
|
||
<li><p><strong>Memory Scaling</strong>: Why does doubling batch size double memory usage? What memory components scale with batch size (activations, gradients, optimizer states)?</p></li>
|
||
<li><p><strong>Throughput Bottleneck</strong>: Your GPU can process 1000 images/sec but disk reads at 100 images/sec—where’s the bottleneck? How would you diagnose this?</p></li>
|
||
<li><p><strong>Shuffle Overhead</strong>: Does shuffling slow down training? Measure the overhead and explain when it becomes significant.</p></li>
|
||
<li><p><strong>Batch Size Trade-off</strong>: What’s the optimal batch size for training ResNet-50 on a 16GB GPU? How would you find it systematically?</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="data-pipeline-theory">
|
||
<h3>Data Pipeline Theory<a class="headerlink" href="#data-pipeline-theory" title="Link to this heading">#</a></h3>
|
||
<ul class="simple">
|
||
<li><p><strong>Iterator Protocol</strong>: How does Python’s <code class="docutils literal notranslate"><span class="pre">for</span></code> loop work under the hood? What methods must an object implement to be iterable?</p></li>
|
||
<li><p><strong>Memory Efficiency</strong>: Why can DataLoader handle datasets larger than RAM? What design pattern enables this?</p></li>
|
||
<li><p><strong>Collation Strategy</strong>: Why do we stack individual samples into batch tensors? What happens if we don’t?</p></li>
|
||
<li><p><strong>Shuffling Impact</strong>: How does shuffling affect gradient estimates and convergence? What happens if you forget to shuffle training data?</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
<section id="ready-to-build">
|
||
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
|
||
<p>You’re about to implement the data loading infrastructure that powers modern AI systems. Understanding how to build efficient, scalable data pipelines is critical for production ML engineering—this isn’t just plumbing, it’s a first-class systems problem with dedicated engineering teams at major AI labs.</p>
|
||
<p>Every production training system depends on robust data loaders. Your implementation will follow the exact patterns used by PyTorch’s <code class="docutils literal notranslate"><span class="pre">torch.utils.data.DataLoader</span></code> and TensorFlow’s <code class="docutils literal notranslate"><span class="pre">tf.data.Dataset</span></code>—the same code running at Meta, Tesla, OpenAI, and every major ML organization.</p>
|
||
<p>Open <code class="docutils literal notranslate"><span class="pre">/Users/VJ/GitHub/TinyTorch/modules/08_dataloader/dataloader.py</span></code> and start building. Take your time with each component, run the inline tests frequently, and think deeply about the memory and throughput trade-offs you’re making.</p>
|
||
<p>Choose your preferred way to engage with this module:</p>
|
||
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
|
||
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
|
||
<div class="sd-col sd-d-flex-row docutils">
|
||
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
|
||
<div class="sd-card-body docutils">
|
||
<div class="sd-card-title sd-font-weight-bold docutils">
|
||
🚀 Launch Binder</div>
|
||
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
|
||
</div>
|
||
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/08_dataloader/dataloader_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/08_dataloader/dataloader_dev.ipynb</span></a></div>
|
||
</div>
|
||
<div class="sd-col sd-d-flex-row docutils">
|
||
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
|
||
<div class="sd-card-body docutils">
|
||
<div class="sd-card-title sd-font-weight-bold docutils">
|
||
⚡ Open in Colab</div>
|
||
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
|
||
</div>
|
||
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/08_dataloader/dataloader_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/08_dataloader/dataloader_dev.ipynb</span></a></div>
|
||
</div>
|
||
<div class="sd-col sd-d-flex-row docutils">
|
||
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
|
||
<div class="sd-card-body docutils">
|
||
<div class="sd-card-title sd-font-weight-bold docutils">
|
||
📖 View Source</div>
|
||
<p class="sd-card-text">Browse the Python source code and understand the implementation.</p>
|
||
</div>
|
||
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/08_dataloader/dataloader.py"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/08_dataloader/dataloader.py</span></a></div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
<div class="tip admonition">
|
||
<p class="admonition-title">💾 Save Your Progress</p>
|
||
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
|
||
<p><strong>After completing this module</strong>, you’ll apply your DataLoader to real datasets in the milestone projects:</p>
|
||
<ul class="simple">
|
||
<li><p><strong>Milestone 03</strong>: Train MLP on MNIST handwritten digits (28×28 images)</p></li>
|
||
<li><p><strong>Milestone 04</strong>: Train CNN on CIFAR-10 natural images (32×32×3 images)</p></li>
|
||
</ul>
|
||
<p>These milestones include download utilities and preprocessing for production datasets.</p>
|
||
</div>
|
||
<hr class="docutils" />
|
||
<div class="prev-next-area">
|
||
<a class="left-prev" href="../chapters/07_training.html" title="previous page">← Previous Module: Training</a>
|
||
<a class="right-next" href="../chapters/09_spatial.html" title="next page">Next Module: Spatial (CNNs) →</a>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
|
||
<script type="text/x-thebe-config">
|
||
{
|
||
requestKernel: true,
|
||
binderOptions: {
|
||
repo: "binder-examples/jupyter-stacks-datascience",
|
||
ref: "master",
|
||
},
|
||
codeMirrorConfig: {
|
||
theme: "abcdef",
|
||
mode: "python"
|
||
},
|
||
kernelOptions: {
|
||
name: "python3",
|
||
path: "./modules"
|
||
},
|
||
predefinedOutput: true
|
||
}
|
||
</script>
|
||
<script>kernelName = 'python3'</script>
|
||
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<footer class="prev-next-footer d-print-none">
|
||
|
||
<div class="prev-next-area">
|
||
<a class="left-prev"
|
||
href="../tiers/architecture.html"
|
||
title="previous page">
|
||
<i class="fa-solid fa-angle-left"></i>
|
||
<div class="prev-next-info">
|
||
<p class="prev-next-subtitle">previous</p>
|
||
<p class="prev-next-title">🏛️ Architecture Tier (Modules 08-13)</p>
|
||
</div>
|
||
</a>
|
||
<a class="right-next"
|
||
href="09_spatial_ABOUT.html"
|
||
title="next page">
|
||
<div class="prev-next-info">
|
||
<p class="prev-next-subtitle">next</p>
|
||
<p class="prev-next-title">09. Spatial Operations</p>
|
||
</div>
|
||
<i class="fa-solid fa-angle-right"></i>
|
||
</a>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
|
||
|
||
|
||
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
|
||
|
||
|
||
<div class="sidebar-secondary-item">
|
||
<div class="page-toc tocsection onthispage">
|
||
<i class="fa-solid fa-list"></i> Contents
|
||
</div>
|
||
<nav class="bd-toc-nav page-toc">
|
||
<ul class="visible nav section-nav flex-column">
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#dataset-abstraction">Dataset Abstraction</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensordataset-implementation">TensorDataset Implementation</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#dataloader-with-batching-and-shuffling">DataLoader with Batching and Shuffling</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing & Validation</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#data-pipeline-theory">Data Pipeline Theory</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
|
||
</ul>
|
||
</nav></div>
|
||
|
||
</div></div>
|
||
|
||
|
||
</div>
|
||
<footer class="bd-footer-content">
|
||
|
||
<div class="bd-footer-content__inner container">
|
||
|
||
<div class="footer-item">
|
||
|
||
<p class="component-author">
|
||
By Prof. Vijay Janapa Reddi (Harvard University)
|
||
</p>
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
|
||
<p class="copyright">
|
||
|
||
© Copyright 2025.
|
||
<br/>
|
||
|
||
</p>
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</footer>
|
||
|
||
|
||
</main>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- Scripts loaded after <body> so the DOM is not blocked -->
|
||
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
|
||
<footer class="bd-footer">
|
||
</footer>
|
||
</body>
|
||
</html> |