Files
TinyTorch/modules/09_spatial_ABOUT.html
2025-12-05 00:52:38 +00:00

1403 lines
96 KiB
HTML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>09. Spatial Operations &#8212; Tiny🔥Torch</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
<link rel="stylesheet" type="text/css" href="../_static/custom.css?v=009d37f4" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=f281be69"></script>
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="../_static/design-tabs.js?v=f930bc37"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script type="module" src="https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs"></script>
<script type="module" src="https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.2.0/dist/mermaid-layout-elk.esm.min.mjs"></script>
<script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";import elkLayouts from "https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.2.0/dist/mermaid-layout-elk.esm.min.mjs";mermaid.registerLayoutLoaders(elkLayouts);mermaid.initialize({startOnLoad:false});</script>
<script src="https://cdn.jsdelivr.net/npm/d3@7.9.0/dist/d3.min.js"></script>
<script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";
const defaultStyle = document.createElement('style');
defaultStyle.textContent = `pre.mermaid {
/* Same as .mermaid-container > pre */
display: block;
width: 100%;
}
pre.mermaid > svg {
/* Same as .mermaid-container > pre > svg */
height: 500px;
width: 100%;
max-width: 100% !important;
}
`;
document.head.appendChild(defaultStyle);
const fullscreenStyle = document.createElement('style');
fullscreenStyle.textContent = `.mermaid-container {
display: flex;
flex-direction: row;
width: 100%;
}
.mermaid-container > pre {
display: block;
width: 100%;
}
.mermaid-container > pre > svg {
height: 500px;
width: 100%;
max-width: 100% !important;
}
.mermaid-fullscreen-btn {
width: 28px;
height: 28px;
background: rgba(255, 255, 255, 0.95);
border: 1px solid rgba(0, 0, 0, 0.3);
border-radius: 4px;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
transition: all 0.2s;
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.2);
font-size: 14px;
line-height: 1;
padding: 0;
color: #333;
}
.mermaid-fullscreen-btn:hover {
opacity: 100% !important;
background: rgba(255, 255, 255, 1);
box-shadow: 0 3px 10px rgba(0, 0, 0, 0.3);
transform: scale(1.1);
}
.mermaid-fullscreen-btn.dark-theme {
background: rgba(50, 50, 50, 0.95);
border: 1px solid rgba(255, 255, 255, 0.3);
color: #e0e0e0;
}
.mermaid-fullscreen-btn.dark-theme:hover {
background: rgba(60, 60, 60, 1);
box-shadow: 0 3px 10px rgba(255, 255, 255, 0.2);
}
.mermaid-fullscreen-modal {
display: none;
position: fixed !important;
top: 0 !important;
left: 0 !important;
width: 95vw;
height: 100vh;
background: rgba(255, 255, 255, 0.98);
z-index: 9999;
padding: 20px;
overflow: auto;
}
.mermaid-fullscreen-modal.dark-theme {
background: rgba(0, 0, 0, 0.98);
}
.mermaid-fullscreen-modal.active {
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen {
position: relative;
width: 95vw;
height: 90vh;
max-width: 95vw;
max-height: 90vh;
background: white;
border-radius: 8px;
padding: 20px;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
overflow: auto;
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen.dark-theme {
background: #1a1a1a;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.8);
}
.mermaid-container-fullscreen pre.mermaid {
width: 100%;
height: 100%;
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen .mermaid svg {
height: 100% !important;
width: 100% !important;
cursor: grab;
}
.mermaid-fullscreen-close {
position: fixed !important;
top: 20px !important;
right: 20px !important;
width: 40px;
height: 40px;
background: rgba(255, 255, 255, 0.95);
border: 1px solid rgba(0, 0, 0, 0.2);
border-radius: 50%;
cursor: pointer;
z-index: 10000;
display: flex;
align-items: center;
justify-content: center;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
transition: all 0.2s;
font-size: 24px;
line-height: 1;
color: #333;
}
.mermaid-fullscreen-close:hover {
background: white;
box-shadow: 0 6px 16px rgba(0, 0, 0, 0.4);
transform: scale(1.1);
}
.mermaid-fullscreen-close.dark-theme {
background: rgba(50, 50, 50, 0.95);
border: 1px solid rgba(255, 255, 255, 0.2);
color: #e0e0e0;
}
.mermaid-fullscreen-close.dark-theme:hover {
background: rgba(60, 60, 60, 1);
box-shadow: 0 6px 16px rgba(255, 255, 255, 0.2);
}
.mermaid-fullscreen-modal .mermaid-fullscreen-btn {
display: none !important;
}`;
document.head.appendChild(fullscreenStyle);
// Detect if page has dark background
const isDarkTheme = () => {
const bgColor = window.getComputedStyle(document.body).backgroundColor;
const match = bgColor.match(/rgb\((\d+),\s*(\d+),\s*(\d+)/);
if (match) {
const r = parseInt(match[1]);
const g = parseInt(match[2]);
const b = parseInt(match[3]);
const brightness = (r * 299 + g * 587 + b * 114) / 1000;
return brightness < 128;
}
return false;
};
const load = async () => {
await mermaid.run();
const all_mermaids = document.querySelectorAll(".mermaid");
const mermaids_processed = document.querySelectorAll(".mermaid[data-processed='true']");
if ("False" === "True") {
const mermaids_to_add_zoom = -1 === -1 ? all_mermaids.length : -1;
if(mermaids_to_add_zoom > 0) {
var svgs = d3.selectAll("");
if(all_mermaids.length !== mermaids_processed.length) {
setTimeout(load, 200);
return;
} else if(svgs.size() !== mermaids_to_add_zoom) {
setTimeout(load, 200);
return;
} else {
svgs.each(function() {
var svg = d3.select(this);
svg.html("<g class='wrapper'>" + svg.html() + "</g>");
var inner = svg.select("g");
var zoom = d3.zoom().on("zoom", function(event) {
inner.attr("transform", event.transform);
});
svg.call(zoom);
});
}
}
} else if(all_mermaids.length !== mermaids_processed.length) {
// Wait for mermaid to process all diagrams
setTimeout(load, 200);
return;
}
const darkTheme = isDarkTheme();
// Stop here if not adding fullscreen capability
if ("True" !== "True") return;
const modal = document.createElement('div');
modal.className = 'mermaid-fullscreen-modal' + (darkTheme ? ' dark-theme' : '');
modal.setAttribute('role', 'dialog');
modal.setAttribute('aria-modal', 'true');
modal.setAttribute('aria-label', 'Fullscreen diagram viewer');
modal.innerHTML = `
<button class="mermaid-fullscreen-close${darkTheme ? ' dark-theme' : ''}" aria-label="Close fullscreen">✕</button>
<div class="mermaid-container-fullscreen${darkTheme ? ' dark-theme' : ''}"></div>
`;
document.body.appendChild(modal);
const modalContent = modal.querySelector('.mermaid-container-fullscreen');
const closeBtn = modal.querySelector('.mermaid-fullscreen-close');
let previousScrollOffset = [window.scrollX, window.scrollY];
const closeModal = () => {
modal.classList.remove('active');
modalContent.innerHTML = '';
document.body.style.overflow = ''
window.scrollTo({left: previousScrollOffset[0], top: previousScrollOffset[1], behavior: 'instant'});
};
closeBtn.addEventListener('click', closeModal);
modal.addEventListener('click', (e) => {
if (e.target === modal) closeModal();
});
document.addEventListener('keydown', (e) => {
if (e.key === 'Escape' && modal.classList.contains('active')) {
closeModal();
}
});
const allButtons = [];
document.querySelectorAll('.mermaid').forEach((mermaidDiv) => {
if (mermaidDiv.parentNode.classList.contains('mermaid-container') ||
mermaidDiv.closest('.mermaid-fullscreen-modal')) {
return;
}
const container = document.createElement('div');
container.className = 'mermaid-container';
mermaidDiv.parentNode.insertBefore(container, mermaidDiv);
container.appendChild(mermaidDiv);
const fullscreenBtn = document.createElement('button');
fullscreenBtn.className = 'mermaid-fullscreen-btn' + (darkTheme ? ' dark-theme' : '');
fullscreenBtn.setAttribute('aria-label', 'View diagram in fullscreen');
fullscreenBtn.textContent = '⛶';
fullscreenBtn.style.opacity = '50%';
// Calculate dynamic position based on diagram's margin and padding
const diagramStyle = window.getComputedStyle(mermaidDiv);
const marginTop = parseFloat(diagramStyle.marginTop) || 0;
const marginRight = parseFloat(diagramStyle.marginRight) || 0;
const paddingTop = parseFloat(diagramStyle.paddingTop) || 0;
const paddingRight = parseFloat(diagramStyle.paddingRight) || 0;
fullscreenBtn.style.top = `${marginTop + paddingTop + 4}px`;
fullscreenBtn.style.right = `${marginRight + paddingRight + 4}px`;
fullscreenBtn.addEventListener('click', () => {
previousScrollOffset = [window.scroll, window.scrollY];
const clone = mermaidDiv.cloneNode(true);
modalContent.innerHTML = '';
modalContent.appendChild(clone);
const svg = clone.querySelector('svg');
if (svg) {
svg.removeAttribute('width');
svg.removeAttribute('height');
svg.style.width = '100%';
svg.style.height = 'auto';
svg.style.maxWidth = '100%';
svg.style.sdisplay = 'block';
if ("False" === "True") {
setTimeout(() => {
const g = svg.querySelector('g');
if (g) {
var svgD3 = d3.select(svg);
svgD3.html("<g class='wrapper'>" + svgD3.html() + "</g>");
var inner = svgD3.select("g");
var zoom = d3.zoom().on("zoom", function(event) {
inner.attr("transform", event.transform);
});
svgD3.call(zoom);
}
}, 100);
}
}
modal.classList.add('active');
document.body.style.overflow = 'hidden';
});
container.appendChild(fullscreenBtn);
allButtons.push(fullscreenBtn);
});
// Update theme classes when theme changes
const updateTheme = () => {
const dark = isDarkTheme();
allButtons.forEach(btn => {
if (dark) {
btn.classList.add('dark-theme');
} else {
btn.classList.remove('dark-theme');
}
});
if (dark) {
modal.classList.add('dark-theme');
modalContent.classList.add('dark-theme');
closeBtn.classList.add('dark-theme');
} else {
modal.classList.remove('dark-theme');
modalContent.classList.remove('dark-theme');
closeBtn.classList.remove('dark-theme');
}
};
// Watch for theme changes
const observer = new MutationObserver(updateTheme);
observer.observe(document.documentElement, {
attributes: true,
attributeFilter: ['class', 'style', 'data-theme']
});
observer.observe(document.body, {
attributes: true,
attributeFilter: ['class', 'style']
});
};
window.addEventListener("load", load);
</script>
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/09_spatial_ABOUT';</script>
<script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
<script src="../_static/wip-banner.js?v=04a7e74d"></script>
<script src="../_static/marimo-badges.js?v=e6289128"></script>
<script src="../_static/sidebar-link.js?v=404b701b"></script>
<script src="../_static/hero-carousel.js?v=10341d2a"></script>
<script src="../_static/subscribe-modal.js?v=42919b64"></script>
<link rel="icon" href="../_static/favicon.svg"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="10. Tokenization - Text to Numerical Sequences" href="10_tokenization_ABOUT.html" />
<link rel="prev" title="08. DataLoader" href="08_dataloader_ABOUT.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search..."
aria-label="Search..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../intro.html">
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../_sources/modules/09_spatial_ABOUT.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>09. Spatial Operations</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-reflect">Build → Use → Reflect</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#convolutional-pipeline-flow">Convolutional Pipeline Flow</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#conv2d-layer-the-heart-of-computer-vision">Conv2d Layer - The Heart of Computer Vision</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#maxpool2d-spatial-downsampling-and-translation-invariance">MaxPool2d - Spatial Downsampling and Translation Invariance</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#simplecnn-complete-architecture">SimpleCNN - Complete Architecture</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#foundations">Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#characteristics">Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="spatial-operations">
<h1>09. Spatial Operations<a class="headerlink" href="#spatial-operations" title="Link to this heading">#</a></h1>
<p><strong>ARCHITECTURE TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 6-8 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>Implement convolutional neural networks (CNNs) from scratch, building the spatial operations that transformed computer vision from hand-crafted features to learned hierarchical representations. Youll discover why weight sharing revolutionizes computer vision by reducing parameters from millions to thousands while achieving superior spatial reasoning that powers everything from image classification to autonomous driving. This module teaches you how Conv2d achieves massive parameter reduction through weight sharing while enabling the spatial structure understanding critical for modern vision systems.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Implement Conv2d Forward Pass</strong>: Build sliding window convolution with explicit loops showing O(B×C_out×H×W××C_in) complexity, understanding how weight sharing applies the same learned filter across all spatial positions to detect features like edges and textures</p></li>
<li><p><strong>Master Weight Sharing Mechanics</strong>: Understand how Conv2d(3→32, kernel=3) uses only 896 parameters while a dense layer for the same 32×32 input needs 32,000 parameters—achieving 35× parameter reduction while preserving spatial structure</p></li>
<li><p><strong>Design Hierarchical Feature Extractors</strong>: Compose Conv → ReLU → Pool blocks into CNN architectures, learning how depth enables complex feature hierarchies from simple local operations (edges → textures → objects)</p></li>
<li><p><strong>Build Pooling Operations</strong>: Implement MaxPool2d and AvgPool2d for spatial downsampling, understanding the trade-off between spatial resolution and computational efficiency (4× memory reduction per 2×2 pooling layer)</p></li>
<li><p><strong>Analyze Receptive Field Growth</strong>: Master how stacked 3×3 convolutions build global context from local operations—two Conv2d layers see 5×5 regions, three layers see 7×7, enabling deep networks to detect large-scale patterns</p></li>
</ul>
</section>
<section id="build-use-reflect">
<h2>Build → Use → Reflect<a class="headerlink" href="#build-use-reflect" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorchs <strong>Build → Use → Reflect</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement Conv2d with explicit sliding window loops to expose computational complexity, create MaxPool2d and AvgPool2d for spatial downsampling, and build Flatten operations connecting spatial and dense layers for complete CNN architectures</p></li>
<li><p><strong>Use</strong>: Train CNNs on CIFAR-10 (60K 32×32 color images) to achieve &gt;75% accuracy, visualize learned feature maps showing edges in early layers and complex patterns in deep layers, and compare CNN vs MLP parameter efficiency on spatial data</p></li>
<li><p><strong>Reflect</strong>: Analyze why weight sharing reduces parameters by 35-1000× while improving spatial reasoning, how stacked 3×3 convolutions build global context from local receptive fields, and what memory-computation trade-offs exist between large kernels vs deep stacking</p></li>
</ol>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="convolutional-pipeline-flow">
<h3>Convolutional Pipeline Flow<a class="headerlink" href="#convolutional-pipeline-flow" title="Link to this heading">#</a></h3>
<p>Convolution transforms spatial data through learnable filters, pooling, and hierarchical feature extraction:</p>
<pre class="mermaid">
graph LR
A[Input Image&lt;br/&gt;H×W×C] --&gt; B[Conv2d&lt;br/&gt;k×k filters]
B --&gt; C[Feature Maps&lt;br/&gt;H'×W'×F]
C --&gt; D[Activation&lt;br/&gt;ReLU]
D --&gt; E[Pool 2×2&lt;br/&gt;Downsample]
E --&gt; F[Output&lt;br/&gt;H'/2×W'/2×F]
style A fill:#e3f2fd
style B fill:#fff3e0
style C fill:#f3e5f5
style D fill:#ffe0b2
style E fill:#fce4ec
style F fill:#f0fdf4
</pre><p><strong>Flow</strong>: Image → Convolution (weight sharing) → Feature maps → Nonlinearity → Pooling → Downsampled features</p>
</section>
<section id="conv2d-layer-the-heart-of-computer-vision">
<h3>Conv2d Layer - The Heart of Computer Vision<a class="headerlink" href="#conv2d-layer-the-heart-of-computer-vision" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">Conv2d</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> 2D Convolutional layer with learnable filters and weight sharing.</span>
<span class="sd"> Implements sliding window convolution where the same learned filter</span>
<span class="sd"> applies across all spatial positions, achieving massive parameter</span>
<span class="sd"> reduction compared to dense layers while preserving spatial structure.</span>
<span class="sd"> Key Concepts:</span>
<span class="sd"> - Weight sharing: Same filter at all spatial positions</span>
<span class="sd"> - Local connectivity: Each output depends on local input region</span>
<span class="sd"> - Learnable filters: Each filter learns to detect different features</span>
<span class="sd"> - Translation invariance: Detected features independent of position</span>
<span class="sd"> Args:</span>
<span class="sd"> in_channels: Number of input channels (3 for RGB, 16 for feature maps)</span>
<span class="sd"> out_channels: Number of learned filters (feature detectors)</span>
<span class="sd"> kernel_size: Spatial size of sliding window (typically 3 or 5)</span>
<span class="sd"> stride: Step size when sliding (1 = no downsampling)</span>
<span class="sd"> padding: Border padding to preserve spatial dimensions</span>
<span class="sd"> Shape:</span>
<span class="sd"> Input: (batch, in_channels, height, width)</span>
<span class="sd"> Output: (batch, out_channels, out_height, out_width)</span>
<span class="sd"> Where: out_height = (height + 2*padding - kernel_size) // stride + 1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">in_channels</span><span class="p">,</span> <span class="n">out_channels</span><span class="p">,</span> <span class="n">kernel_size</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">stride</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">padding</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span>
<span class="c1"># Initialize learnable filters: one per output channel</span>
<span class="c1"># Shape: (out_channels, in_channels, kernel_size, kernel_size)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">weight</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="n">out_channels</span><span class="p">,</span> <span class="n">in_channels</span><span class="p">,</span> <span class="n">kernel_size</span><span class="p">,</span> <span class="n">kernel_size</span><span class="p">))</span>
<span class="c1"># He initialization for ReLU networks</span>
<span class="n">fan_in</span> <span class="o">=</span> <span class="n">in_channels</span> <span class="o">*</span> <span class="n">kernel_size</span> <span class="o">*</span> <span class="n">kernel_size</span>
<span class="n">std</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="n">fan_in</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">data</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">std</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Apply sliding window convolution with explicit loops to show cost.&quot;&quot;&quot;</span>
<span class="n">batch</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">H</span><span class="p">,</span> <span class="n">W</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span>
<span class="n">out_h</span> <span class="o">=</span> <span class="p">(</span><span class="n">H</span> <span class="o">+</span> <span class="mi">2</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">padding</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span><span class="p">)</span> <span class="o">//</span> <span class="bp">self</span><span class="o">.</span><span class="n">stride</span> <span class="o">+</span> <span class="mi">1</span>
<span class="n">out_w</span> <span class="o">=</span> <span class="p">(</span><span class="n">W</span> <span class="o">+</span> <span class="mi">2</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">padding</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span><span class="p">)</span> <span class="o">//</span> <span class="bp">self</span><span class="o">.</span><span class="n">stride</span> <span class="o">+</span> <span class="mi">1</span>
<span class="c1"># Apply padding if needed</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">padding</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">pad</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">padding</span><span class="p">)</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="n">batch</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">out_channels</span><span class="p">,</span> <span class="n">out_h</span><span class="p">,</span> <span class="n">out_w</span><span class="p">))</span>
<span class="c1"># Explicit 7-nested loop showing O(B×C_out×H×W×K_h×K_w×C_in) complexity</span>
<span class="k">for</span> <span class="n">b</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">batch</span><span class="p">):</span>
<span class="k">for</span> <span class="n">oc</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">out_channels</span><span class="p">):</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">out_h</span><span class="p">):</span>
<span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">out_w</span><span class="p">):</span>
<span class="c1"># Extract local patch from input</span>
<span class="n">i_start</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">stride</span>
<span class="n">j_start</span> <span class="o">=</span> <span class="n">j</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">stride</span>
<span class="n">patch</span> <span class="o">=</span> <span class="n">x</span><span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="p">:,</span> <span class="n">i_start</span><span class="p">:</span><span class="n">i_start</span><span class="o">+</span><span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span><span class="p">,</span>
<span class="n">j_start</span><span class="p">:</span><span class="n">j_start</span><span class="o">+</span><span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span><span class="p">]</span>
<span class="c1"># Convolution: dot product between filter and patch</span>
<span class="n">output</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">oc</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">patch</span><span class="o">.</span><span class="n">data</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">oc</span><span class="p">])</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
<span class="k">return</span> <span class="n">output</span>
</pre></div>
</div>
<p><strong>Why Explicit Loops Matter</strong>: Modern frameworks optimize convolution with im2col transformations and cuDNN kernels, achieving 10-100× speedups. But the explicit loops reveal where computational cost lives—helping you understand why kernel size matters enormously and why production systems carefully balance depth vs width.</p>
</section>
<section id="maxpool2d-spatial-downsampling-and-translation-invariance">
<h3>MaxPool2d - Spatial Downsampling and Translation Invariance<a class="headerlink" href="#maxpool2d-spatial-downsampling-and-translation-invariance" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">MaxPool2d</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Max pooling for spatial downsampling and translation invariance.</span>
<span class="sd"> Extracts maximum value from each local region, providing:</span>
<span class="sd"> - Spatial dimension reduction (4× memory reduction per 2×2 pooling)</span>
<span class="sd"> - Translation invariance (robustness to small shifts)</span>
<span class="sd"> - Feature importance selection (keep strongest activations)</span>
<span class="sd"> Args:</span>
<span class="sd"> kernel_size: Size of pooling window (typically 2)</span>
<span class="sd"> stride: Step size when sliding (defaults to kernel_size)</span>
<span class="sd"> Shape:</span>
<span class="sd"> Input: (batch, channels, height, width)</span>
<span class="sd"> Output: (batch, channels, out_height, out_width)</span>
<span class="sd"> Where: out_height = (height - kernel_size) // stride + 1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">kernel_size</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">stride</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span> <span class="o">=</span> <span class="n">kernel_size</span>
<span class="bp">self</span><span class="o">.</span><span class="n">stride</span> <span class="o">=</span> <span class="n">stride</span> <span class="k">if</span> <span class="n">stride</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">kernel_size</span>
<span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Extract maximum value from each local region.&quot;&quot;&quot;</span>
<span class="n">batch</span><span class="p">,</span> <span class="n">channels</span><span class="p">,</span> <span class="n">H</span><span class="p">,</span> <span class="n">W</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span>
<span class="n">out_h</span> <span class="o">=</span> <span class="p">(</span><span class="n">H</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span><span class="p">)</span> <span class="o">//</span> <span class="bp">self</span><span class="o">.</span><span class="n">stride</span> <span class="o">+</span> <span class="mi">1</span>
<span class="n">out_w</span> <span class="o">=</span> <span class="p">(</span><span class="n">W</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span><span class="p">)</span> <span class="o">//</span> <span class="bp">self</span><span class="o">.</span><span class="n">stride</span> <span class="o">+</span> <span class="mi">1</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="n">batch</span><span class="p">,</span> <span class="n">channels</span><span class="p">,</span> <span class="n">out_h</span><span class="p">,</span> <span class="n">out_w</span><span class="p">))</span>
<span class="k">for</span> <span class="n">b</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">batch</span><span class="p">):</span>
<span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">channels</span><span class="p">):</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">out_h</span><span class="p">):</span>
<span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">out_w</span><span class="p">):</span>
<span class="n">i_start</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">stride</span>
<span class="n">j_start</span> <span class="o">=</span> <span class="n">j</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">stride</span>
<span class="n">patch</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">i_start</span><span class="p">:</span><span class="n">i_start</span><span class="o">+</span><span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span><span class="p">,</span>
<span class="n">j_start</span><span class="p">:</span><span class="n">j_start</span><span class="o">+</span><span class="bp">self</span><span class="o">.</span><span class="n">kernel_size</span><span class="p">]</span>
<span class="n">output</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">]</span> <span class="o">=</span> <span class="n">patch</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
<span class="k">return</span> <span class="n">output</span>
</pre></div>
</div>
<p><strong>MaxPool vs AvgPool</strong>: MaxPool preserves sharp features like edges (takes max activation), while AvgPool creates smoother features (averages the window). Production systems typically use MaxPool for feature extraction and Global Average Pooling for final classification layers.</p>
</section>
<section id="simplecnn-complete-architecture">
<h3>SimpleCNN - Complete Architecture<a class="headerlink" href="#simplecnn-complete-architecture" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">SimpleCNN</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Complete CNN for CIFAR-10 image classification.</span>
<span class="sd"> Architecture: Conv → ReLU → Pool → Conv → ReLU → Pool → Flatten → Dense</span>
<span class="sd"> Layer-by-layer transformation:</span>
<span class="sd"> Input: (B, 3, 32, 32) RGB images</span>
<span class="sd"> Conv1: (B, 32, 32, 32) - 32 filters detect edges/textures</span>
<span class="sd"> Pool1: (B, 32, 16, 16) - downsample by 2×</span>
<span class="sd"> Conv2: (B, 64, 16, 16) - 64 filters detect shapes/patterns</span>
<span class="sd"> Pool2: (B, 64, 8, 8) - downsample by 2×</span>
<span class="sd"> Flatten: (B, 4096) - convert spatial to vector</span>
<span class="sd"> Dense: (B, 10) - classify into 10 categories</span>
<span class="sd"> Parameters: ~500K (vs ~4M for equivalent dense network)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="c1"># Feature extraction backbone</span>
<span class="bp">self</span><span class="o">.</span><span class="n">conv1</span> <span class="o">=</span> <span class="n">Conv2d</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">32</span><span class="p">,</span> <span class="n">kernel_size</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">padding</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">pool1</span> <span class="o">=</span> <span class="n">MaxPool2d</span><span class="p">(</span><span class="n">kernel_size</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">conv2</span> <span class="o">=</span> <span class="n">Conv2d</span><span class="p">(</span><span class="mi">32</span><span class="p">,</span> <span class="mi">64</span><span class="p">,</span> <span class="n">kernel_size</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">padding</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">pool2</span> <span class="o">=</span> <span class="n">MaxPool2d</span><span class="p">(</span><span class="n">kernel_size</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="c1"># Classification head</span>
<span class="bp">self</span><span class="o">.</span><span class="n">flatten</span> <span class="o">=</span> <span class="n">Flatten</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">fc</span> <span class="o">=</span> <span class="n">Linear</span><span class="p">(</span><span class="mi">64</span> <span class="o">*</span> <span class="mi">8</span> <span class="o">*</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="c1"># Hierarchical feature extraction</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">pool1</span><span class="p">(</span><span class="n">relu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">conv1</span><span class="p">(</span><span class="n">x</span><span class="p">)))</span> <span class="c1"># (B, 32, 16, 16)</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">pool2</span><span class="p">(</span><span class="n">relu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">conv2</span><span class="p">(</span><span class="n">x</span><span class="p">)))</span> <span class="c1"># (B, 64, 8, 8)</span>
<span class="c1"># Classification</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">flatten</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="c1"># (B, 4096)</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">fc</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="c1"># (B, 10)</span>
<span class="k">return</span> <span class="n">x</span>
</pre></div>
</div>
<p><strong>Architecture Design Principles</strong>: This follows the standard CNN pattern—alternating Conv+ReLU (feature extraction) with Pooling (dimension reduction). Each Conv layer learns hierarchical features (Layer 1: edges → Layer 2: shapes), while pooling provides computational efficiency and translation invariance.</p>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure you understand the foundations from previous modules:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch
<span class="c1"># Verify prerequisite modules are complete</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>tensor<span class="w"> </span><span class="c1"># Module 01: Tensor operations</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>activations<span class="w"> </span><span class="c1"># Module 02: ReLU activation</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>layers<span class="w"> </span><span class="c1"># Module 03: Linear layers</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>dataloader<span class="w"> </span><span class="c1"># Module 08: Batch loading</span>
</pre></div>
</div>
<p><strong>Why These Prerequisites</strong>:</p>
<ul class="simple">
<li><p><strong>Tensor</strong>: Conv2d requires tensor indexing, reshaping, and broadcasting for sliding windows</p></li>
<li><p><strong>Activations</strong>: CNNs use ReLU after each convolution for non-linear feature learning</p></li>
<li><p><strong>Layers</strong>: Dense classification layers connect to CNN feature extraction</p></li>
<li><p><strong>DataLoader</strong>: CIFAR-10 training requires batch loading and data augmentation</p></li>
</ul>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/09_spatial/spatial_dev.py</span></code></p></li>
<li><p><strong>Implement Conv2d forward pass</strong>: Build sliding window convolution with explicit loops showing computational complexity</p></li>
<li><p><strong>Create MaxPool2d and AvgPool2d</strong>: Implement spatial downsampling with different aggregation strategies</p></li>
<li><p><strong>Build Flatten operation</strong>: Connect spatial feature maps to dense layers</p></li>
<li><p><strong>Design SimpleCNN architecture</strong>: Compose spatial and dense layers into complete CNN</p></li>
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">09</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">spatial</span></code></p></li>
</ol>
<p><strong>Development Tips</strong>:</p>
<ul class="simple">
<li><p>Start with small inputs (8×8 images) to debug convolution logic before scaling to 32×32</p></li>
<li><p>Print intermediate shapes at each layer to verify dimension calculations</p></li>
<li><p>Visualize feature maps after Conv layers to understand learned filters</p></li>
<li><p>Compare parameter counts: Conv2d(3→32, k=3) = 896 params vs Dense(3072→32) = 98,304 params</p></li>
</ul>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify spatial operation functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>spatial
<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>spatial<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Conv2d Shape Propagation</strong>: Verifies output dimensions match formula (H+2P-K)//S+1 for various kernel sizes, strides, and padding</p></li>
<li><p><strong>Weight Sharing Validation</strong>: Confirms same filter applies at all spatial positions, achieving parameter reduction vs dense layers</p></li>
<li><p><strong>Pooling Correctness</strong>: Tests MaxPool extracts maximum values and AvgPool computes correct averages across windows</p></li>
<li><p><strong>Translation Invariance</strong>: Verifies CNNs detect features regardless of spatial position through weight sharing</p></li>
<li><p><strong>Complete CNN Pipeline</strong>: End-to-end test processing CIFAR-10 images through Conv → Pool → Flatten → Dense architecture</p></li>
</ul>
</section>
<section id="inline-testing-validation">
<h3>Inline Testing &amp; Validation<a class="headerlink" href="#inline-testing-validation" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive inline tests during development:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Run inline unit tests</span>
<span class="n">cd</span> <span class="o">/</span><span class="n">Users</span><span class="o">/</span><span class="n">VJ</span><span class="o">/</span><span class="n">GitHub</span><span class="o">/</span><span class="n">TinyTorch</span><span class="o">/</span><span class="n">modules</span><span class="o">/</span><span class="mi">09</span><span class="n">_spatial</span>
<span class="n">python</span> <span class="n">spatial_dev</span><span class="o">.</span><span class="n">py</span>
<span class="c1"># Expected output:</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Conv2d</span><span class="o">...</span>
<span class="err"></span> <span class="n">Sliding</span> <span class="n">window</span> <span class="n">convolution</span> <span class="n">works</span> <span class="n">correctly</span>
<span class="err"></span> <span class="n">Weight</span> <span class="n">sharing</span> <span class="n">applied</span> <span class="n">at</span> <span class="nb">all</span> <span class="n">positions</span>
<span class="err"></span> <span class="n">Output</span> <span class="n">shape</span> <span class="n">matches</span> <span class="n">calculated</span> <span class="n">dimensions</span>
<span class="err"></span> <span class="n">Parameter</span> <span class="n">count</span><span class="p">:</span> <span class="mi">896</span> <span class="p">(</span><span class="n">vs</span> <span class="mi">32</span><span class="p">,</span><span class="mi">000</span> <span class="k">for</span> <span class="n">dense</span> <span class="n">layer</span><span class="p">)</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">Conv2d</span> <span class="n">forward</span> <span class="k">pass</span> <span class="n">implemented</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Pooling</span> <span class="n">Operations</span><span class="o">...</span>
<span class="err"></span> <span class="n">MaxPool2d</span> <span class="n">extracts</span> <span class="n">maximum</span> <span class="n">values</span> <span class="n">correctly</span>
<span class="err"></span> <span class="n">AvgPool2d</span> <span class="n">computes</span> <span class="n">averages</span> <span class="n">correctly</span>
<span class="err"></span> <span class="n">Spatial</span> <span class="n">dimensions</span> <span class="n">reduced</span> <span class="n">by</span> <span class="n">factor</span> <span class="n">of</span> <span class="n">kernel_size</span>
<span class="err"></span> <span class="n">Translation</span> <span class="n">invariance</span> <span class="nb">property</span> <span class="n">verified</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">Pooling</span> <span class="n">layers</span> <span class="n">implemented</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">SimpleCNN</span> <span class="n">Integration</span><span class="o">...</span>
<span class="err"></span> <span class="n">Forward</span> <span class="k">pass</span> <span class="n">through</span> <span class="nb">all</span> <span class="n">layers</span> <span class="n">successful</span>
<span class="err"></span> <span class="n">Output</span> <span class="n">shape</span><span class="p">:</span> <span class="p">(</span><span class="mi">32</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="k">for</span> <span class="mi">10</span> <span class="n">CIFAR</span><span class="o">-</span><span class="mi">10</span> <span class="n">classes</span>
<span class="err"></span> <span class="n">Total</span> <span class="n">parameters</span><span class="p">:</span> <span class="o">~</span><span class="mi">500</span><span class="n">K</span> <span class="p">(</span><span class="n">efficient</span><span class="err">!</span><span class="p">)</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">CNN</span> <span class="n">architecture</span> <span class="n">complete</span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
<p>Test individual components interactively:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">spatial_dev</span><span class="w"> </span><span class="kn">import</span> <span class="n">Conv2d</span><span class="p">,</span> <span class="n">MaxPool2d</span><span class="p">,</span> <span class="n">SimpleCNN</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
<span class="c1"># Test Conv2d with small input</span>
<span class="n">conv</span> <span class="o">=</span> <span class="n">Conv2d</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">16</span><span class="p">,</span> <span class="n">kernel_size</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">padding</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">8</span><span class="p">))</span>
<span class="n">out</span> <span class="o">=</span> <span class="n">conv</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Conv2d output shape: </span><span class="si">{</span><span class="n">out</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span> <span class="c1"># (2, 16, 8, 8)</span>
<span class="c1"># Test MaxPool dimension reduction</span>
<span class="n">pool</span> <span class="o">=</span> <span class="n">MaxPool2d</span><span class="p">(</span><span class="n">kernel_size</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">pooled</span> <span class="o">=</span> <span class="n">pool</span><span class="p">(</span><span class="n">out</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;MaxPool output shape: </span><span class="si">{</span><span class="n">pooled</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span> <span class="c1"># (2, 16, 4, 4)</span>
<span class="c1"># Test complete CNN</span>
<span class="n">cnn</span> <span class="o">=</span> <span class="n">SimpleCNN</span><span class="p">(</span><span class="n">num_classes</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="n">img</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">32</span><span class="p">,</span> <span class="mi">32</span><span class="p">))</span>
<span class="n">logits</span> <span class="o">=</span> <span class="n">cnn</span><span class="p">(</span><span class="n">img</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;CNN output shape: </span><span class="si">{</span><span class="n">logits</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span> <span class="c1"># (4, 10)</span>
<span class="c1"># Count parameters</span>
<span class="n">params</span> <span class="o">=</span> <span class="n">cnn</span><span class="o">.</span><span class="n">parameters</span><span class="p">()</span>
<span class="n">total</span> <span class="o">=</span> <span class="nb">sum</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">prod</span><span class="p">(</span><span class="n">p</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span> <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">params</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Total parameters: </span><span class="si">{</span><span class="n">total</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span> <span class="c1"># ~500,000</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-applications">
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Link to this heading">#</a></h3>
<p><strong>Autonomous Driving - Tesla Autopilot</strong></p>
<p><strong>Challenge</strong>: Teslas Autopilot processes 8 cameras at 36 FPS with 1280×960 resolution, running CNN backbones to extract features for object detection, lane recognition, and depth estimation. The entire inference must complete in &lt;30ms for real-time control.</p>
<p><strong>Solution</strong>: Efficient CNN architectures (MobileNet-style depthwise separable convolutions) and aggressive optimization (TensorRT compilation, INT8 quantization) balance accuracy vs latency on embedded hardware (Tesla FSD computer: 144 TOPS).</p>
<p><strong>Your Implementation Connection</strong>: Understanding Conv2ds computational cost (K²×C_in×C_out×H×W operations) reveals why Tesla optimizes kernel sizes and channel counts carefully—every operation matters at 36 FPS × 8 cameras = 288 frames/second total processing.</p>
<p><strong>Medical Imaging - Diagnostic Assistance</strong></p>
<p><strong>Challenge</strong>: CNN systems analyze X-rays, CT scans, and pathology slides for diagnostic assistance. PathAIs breast cancer detection achieves 97% sensitivity (vs 92% for individual pathologists) by training deep CNNs on millions of annotated slides. Medical deployment requires interpretability—doctors need to understand why the CNN made a prediction.</p>
<p><strong>Solution</strong>: Visualizing intermediate feature maps and using attention mechanisms to highlight diagnostic regions. Grad-CAM (Gradient-weighted Class Activation Mapping) shows which spatial regions contributed most to the prediction.</p>
<p><strong>Your Implementation Connection</strong>: Your Conv2ds feature maps can be visualized showing which spatial regions activate strongly for different filters. This interpretability is crucial for medical deployment where “black box” predictions are insufficient for clinical decisions.</p>
<p><strong>Face Recognition - Apple Face ID</strong></p>
<p><strong>Challenge</strong>: Apples Face ID uses CNNs to generate face embeddings enabling secure device unlock with &lt;1 in 1,000,000 false accept rate. The entire pipeline (detection + alignment + embedding + matching) runs on-device in real-time. Privacy requires on-device processing, demanding lightweight CNN architectures.</p>
<p><strong>Solution</strong>: MobileNet-style CNNs with depthwise separable convolutions reduce parameters by 8-10× while maintaining accuracy. The entire model fits in &lt;10MB, enabling on-device execution protecting user privacy.</p>
<p><strong>Your Implementation Connection</strong>: Understanding Conv2ds parameter count (C_out×C_in×K²) reveals why face recognition systems carefully design CNN architectures—fewer parameters enable on-device deployment without sacrificing accuracy.</p>
<p><strong>Historical Impact - AlexNet to ResNet</strong></p>
<p><strong>LeNet-5 (1998)</strong>: Yann LeCuns CNN successfully read handwritten zip codes for the US Postal Service, establishing the Conv → Pool → Conv → Pool → Dense pattern your SimpleCNN follows. Training took days on CPUs, limiting practical deployment.</p>
<p><strong>AlexNet (2012)</strong>: Won ImageNet with 16% error (vs 26% for hand-crafted features), sparking the deep learning revolution. Key innovation: training deep CNNs on GPUs with massive datasets proved that scale + convolution = breakthrough performance.</p>
<p><strong>VGG (2014)</strong>: Demonstrated that deeper CNNs with simple 3×3 kernels outperform shallow networks with large kernels. Established that stacking many small convolutions beats few large ones—the computational trade-off analysis below.</p>
<p><strong>ResNet (2015)</strong>: 152-layer CNN achieved 3.6% ImageNet error (better than human 5% baseline) via skip connections solving vanishing gradients. Your Conv2d is the foundation—ResNet is “just” your layers with residual connections enabling extreme depth.</p>
</section>
<section id="foundations">
<h3>Foundations<a class="headerlink" href="#foundations" title="Link to this heading">#</a></h3>
<p><strong>Weight Sharing and Parameter Efficiency</strong></p>
<p><strong>Question</strong>: A Conv2d(3, 32, kernel_size=3) layer has 32 filters × (3 channels × 3×3 spatial) = 896 parameters. For a 32×32 RGB image, a dense layer producing 32 feature maps of the same resolution needs (3×32×32) × (32×32×32) = 3,072 × 32,768 = ~100 million parameters. Why does convolution reduce parameters by 100,000×? How does weight sharing enable this dramatic reduction? What spatial assumption does convolution make that dense layers dont—and when might this assumption break?</p>
<p><strong>Key Insights</strong>:</p>
<ul class="simple">
<li><p><strong>Weight Sharing</strong>: Conv2d applies the same 3×3×3 filter at all 32×32 = 1,024 positions, sharing 896 parameters across 1,024 locations. Dense layers learn independent weights for each position.</p></li>
<li><p><strong>Local Connectivity</strong>: Each conv output depends only on a local 3×3 neighborhood, not the entire image. This inductive bias reduces parameters but assumes nearby pixels are more related than distant ones.</p></li>
<li><p><strong>When It Breaks</strong>: For tasks where spatial relationships dont follow local patterns (e.g., finding relationships between distant objects), convolutions local connectivity limits expressiveness. This motivates attention mechanisms in Vision Transformers.</p></li>
</ul>
<p><strong>Translation Invariance Through Weight Sharing</strong></p>
<p><strong>Question</strong>: A CNN detects a cat regardless of whether it appears in the top-left or bottom-right corner of an image. A dense network trained on top-left cats fails on bottom-right cats. How does weight sharing enable translation invariance? Why does applying the same filter at all spatial positions make detected features position-independent? Whats the trade-off: what spatial information does convolution lose by treating all positions equally?</p>
<p><strong>Key Insights</strong>:</p>
<ul class="simple">
<li><p><strong>Same Filter Everywhere</strong>: Weight sharing means the “cat ear detector” filter slides across the entire image, detecting ears wherever they appear. Dense layers have position-specific weights that dont generalize spatially.</p></li>
<li><p><strong>Pooling Enhances Invariance</strong>: MaxPool further increases invariance—if the cat moves 1 pixel, the max in each 2×2 window often stays the same, making predictions robust to small shifts.</p></li>
<li><p><strong>Trade-off</strong>: Convolution loses absolute position information. For tasks requiring precise localization (e.g., object detection), networks must add position embeddings or specialized heads to recover spatial coordinates.</p></li>
</ul>
<p><strong>Hierarchical Feature Learning</strong></p>
<p><strong>Question</strong>: Early CNN layers (Conv1) learn to detect edges and simple textures. Deep layers (Conv5) detect complex objects like faces and cars. This feature hierarchy emerges automatically from stacking convolutions—its not explicitly programmed. How do stacked convolutions build hierarchical representations from local operations? Why dont deep dense networks show this hierarchical organization? What role does the receptive field (the input region affecting each output) play in hierarchical learning?</p>
<p><strong>Key Insights</strong>:</p>
<ul class="simple">
<li><p><strong>Receptive Field Growth</strong>: A single 3×3 conv sees 9 pixels. Two stacked 3×3 convs see 5×5 (25 pixels). Three see 7×7 (49 pixels). Deeper layers see larger input regions, enabling detection of larger patterns.</p></li>
<li><p><strong>Compositional Learning</strong>: Early layers learn simple features (edges). Middle layers combine edges into textures and corners. Deep layers combine textures into object parts (eyes, wheels), then complete objects.</p></li>
<li><p><strong>Why Dense Doesnt</strong>: Dense layers lack spatial structure—each neuron connects to all inputs equally. Without spatial inductive bias (local connectivity + weight sharing), dense networks dont naturally learn hierarchical spatial features.</p></li>
</ul>
</section>
<section id="characteristics">
<h3>Characteristics<a class="headerlink" href="#characteristics" title="Link to this heading">#</a></h3>
<p><strong>Receptive Field Growth and Global Context</strong></p>
<p><strong>Question</strong>: A single Conv2d(kernel_size=3) sees a 3×3 region. Two stacked Conv2d layers see a 5×5 region (center of second layer sees 3×3 of first layer, which each see 3×3 of input). Three layers see 7×7. How many Conv2d(kernel_size=3) layers are needed to see an entire 32×32 image? How do deep CNNs build global context from local operations? Whats the trade-off: why not use one large Conv2d(kernel_size=32) instead of stacking many small kernels?</p>
<p><strong>Key Insights</strong>:</p>
<ul class="simple">
<li><p><strong>Receptive Field Formula</strong>: For N layers with kernel size K, receptive field = 1 + N×(K-1). For K=3: RF = 1+2N. To cover 32×32 requires RF ≥ 32, so N ≥ 15.5 → need 16 Conv2d(3×3) layers.</p></li>
<li><p><strong>Stacking Benefits</strong>: Three Conv2d(3×3) layers have 3×(C²×9) = 27C² parameters and 3 ReLU nonlinearities. One Conv2d(7×7) has C²×49 parameters and 1 ReLU. Stacking provides parameter efficiency and more non-linear transformations for the same receptive field.</p></li>
<li><p><strong>Trade-off</strong>: Deeper stacking increases computational cost (more layers to process) and training difficulty (vanishing gradients). But gains from parameter efficiency and expressiveness typically outweigh costs—hence VGGs success with stacked 3×3 convs vs AlexNets large kernels.</p></li>
</ul>
<p><strong>Computational Cost and Optimization Strategies</strong></p>
<p><strong>Question</strong>: A Conv2d(64→64, kernel_size=7) has 64×64×7×7 = 200K parameters and processes (64×7×7) = 3,136 operations per output pixel. Three stacked Conv2d(64→64, kernel_size=3) have 3×(64×64×3×3) = 110K parameters but perform 3×(64×3×3) = 1,728 operations per output pixel at each of 3 layers. Which is better for parameter efficiency? For computational cost? For feature learning? Why did the field shift from AlexNets 11×11 kernels to VGG/ResNets 3×3 stacks?</p>
<p><strong>Key Insights</strong>:</p>
<ul class="simple">
<li><p><strong>Parameter Efficiency</strong>: Stacked 3×3 (110K params) beats single 7×7 (200K params) by 1.8×.</p></li>
<li><p><strong>Computational Cost</strong>: Stacked approach performs 3×1,728 = 5,184 ops per output pixel vs 3,136 for single 7×7. Stacking costs 1.65× more computation.</p></li>
<li><p><strong>Feature Learning</strong>: Stacking provides 3 ReLU nonlinearities vs 1, enabling more complex feature transformations. The expressiveness gain from depth outweighs the 1.65× compute cost.</p></li>
<li><p><strong>Modern Practice</strong>: VGG established that stacked 3×3 convs outperform large kernels. ResNet, EfficientNet, and modern architectures all use 3×3 (or 1×1 for channel mixing) due to better parameter-computation-expressiveness trade-off.</p></li>
</ul>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>Youre about to implement the spatial operations that revolutionized how machines see. Before deep learning, computer vision relied on hand-crafted features like SIFT and HOG—human experts manually designed algorithms to detect edges, corners, and textures. AlexNets 2012 ImageNet victory proved that learned convolutional features outperform hand-crafted ones, launching the deep learning revolution. Today, CNNs process billions of images daily across Metas photo tagging (2B photos/day), Teslas Autopilot (real-time multi-camera processing), and Google Photos (trillion+ image search).</p>
<p>The Conv2d operations youll implement arent just educational exercises—theyre the same patterns powering production vision systems. Your sliding window convolution reveals why kernel size matters enormously (7×7 kernels cost 5.4× more than 3×3) and why weight sharing enables CNNs to learn from spatial data 100× more efficiently than dense networks. The explicit loops expose computational costs that modern frameworks hide with im2col transformations and cuDNN kernels—understanding the naive implementation reveals where optimizations matter most.</p>
<p>By building CNNs from first principles, youll understand not just how convolution works, but why it works—why weight sharing provides translation invariance, how stacked small kernels build global context from local operations, and what memory-computation trade-offs govern architecture design. These insights prepare you to design efficient CNN architectures for resource-constrained deployment (mobile, edge devices) and to debug performance bottlenecks in production systems.</p>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
🚀 Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/09_spatial/spatial_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/09_spatial/spatial_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
⚡ Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/09_spatial/spatial_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/09_spatial/spatial_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
📖 View Source</div>
<p class="sd-card-text">Browse the Jupyter notebook source and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/09_spatial/spatial_dev.ipynb"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/09_spatial/spatial_dev.ipynb</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">💾 Save Your Progress</p>
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<p><strong>Local Development</strong>:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span><span class="w"> </span>/Users/VJ/GitHub/TinyTorch/modules/09_spatial
python<span class="w"> </span>spatial_dev.py<span class="w"> </span><span class="c1"># Run inline tests</span>
tito<span class="w"> </span>module<span class="w"> </span><span class="nb">complete</span><span class="w"> </span><span class="m">09</span><span class="w"> </span><span class="c1"># Export to package</span>
</pre></div>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="../08_dataloader/ABOUT.html" title="previous page">← Module 08: DataLoader</a>
<a class="right-next" href="../10_tokenization/ABOUT.html" title="next page">Module 10: Tokenization →</a>
</div>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./modules"
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="08_dataloader_ABOUT.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">08. DataLoader</p>
</div>
</a>
<a class="right-next"
href="10_tokenization_ABOUT.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">10. Tokenization - Text to Numerical Sequences</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-reflect">Build → Use → Reflect</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#convolutional-pipeline-flow">Convolutional Pipeline Flow</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#conv2d-layer-the-heart-of-computer-vision">Conv2d Layer - The Heart of Computer Vision</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#maxpool2d-spatial-downsampling-and-translation-invariance">MaxPool2d - Spatial Downsampling and Translation Invariance</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#simplecnn-complete-architecture">SimpleCNN - Complete Architecture</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#foundations">Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#characteristics">Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2025.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>