TinyTorch/dev/modules/02_activations_ABOUT.html


<!DOCTYPE html>


<html lang="en" data-content_root="../" >

  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />

    <title>02. Activations &#8212; Tiny🔥Torch</title>


  <script data-cfasync="false">
    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
  </script>

  <!-- Loaded before other Sphinx assets -->
  <link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />


  <link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
  <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />

    <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
    <link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
    <link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
    <link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
    <link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
    <link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
    <link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
    <link rel="stylesheet" type="text/css" href="../_static/custom.css?v=afcf7c3c" />

  <!-- Pre-loaded scripts that we'll load fully later -->
  <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
  <script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>

    <script src="../_static/documentation_options.js?v=9eb32ce0"></script>
    <script src="../_static/doctools.js?v=9a2dae69"></script>
    <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../_static/clipboard.min.js?v=a7894cd8"></script>
    <script src="../_static/copybutton.js?v=f281be69"></script>
    <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
    <script>let toggleHintShow = 'Click to show';</script>
    <script>let toggleHintHide = 'Click to hide';</script>
    <script>let toggleOpenOnPrint = 'true';</script>
    <script src="../_static/togglebutton.js?v=4a39c7ea"></script>
    <script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
    <script src="../_static/design-tabs.js?v=f930bc37"></script>
    <script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
    <script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
    <script>DOCUMENTATION_OPTIONS.pagename = 'modules/02_activations_ABOUT';</script>
    <script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
    <script src="../_static/wip-banner.js?v=5357532b"></script>
    <script src="../_static/marimo-badges.js?v=1e5d2842"></script>
    <script src="../_static/sidebar-link.js?v=404b701b"></script>
    <script src="../_static/hero-carousel.js?v=10341d2a"></script>
    <link rel="icon" href="../_static/favicon.svg"/>
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" />
    <link rel="next" title="03. Layers" href="03_layers_ABOUT.html" />
    <link rel="prev" title="01. Tensor" href="01_tensor_ABOUT.html" />
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <meta name="docsearch:language" content="en"/>
  </head>


  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">


  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>

  <div id="pst-scroll-pixel-helper"></div>

  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
    <i class="fa-solid fa-arrow-up"></i>Back to top</button>


  <input type="checkbox"
          class="sidebar-toggle"
          id="pst-primary-sidebar-checkbox"/>
  <label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>

  <input type="checkbox"
          class="sidebar-toggle"
          id="pst-secondary-sidebar-checkbox"/>
  <label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>

  <div class="search-button__wrapper">
    <div class="search-button__overlay"></div>
    <div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
      action="../search.html"
      method="get">
  <i class="fa-solid fa-magnifying-glass"></i>
  <input type="search"
         class="form-control"
         name="q"
         id="search-input"
         placeholder="Search..."
         aria-label="Search..."
         autocomplete="off"
         autocorrect="off"
         autocapitalize="off"
         spellcheck="false"/>
  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
  </div>

  <div class="pst-async-banner-revealer d-none">
  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>


    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
    </header>


  <div class="bd-container">
    <div class="bd-container__inner bd-page-width">


      <div class="bd-sidebar-primary bd-sidebar">


  <div class="sidebar-header-items sidebar-primary__section">


  </div>

    <div class="sidebar-primary-items__start sidebar-primary__section">
        <div class="sidebar-primary-item">


<a class="navbar-brand logo" href="../intro.html">


    <img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
    <script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>


</a></div>
        <div class="sidebar-primary-item">

 <script>
 document.write(`
   <button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass"></i>
    <span class="search-button__default-text">Search</span>
    <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
   </button>
 `);
 </script></div>
        <div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
    <div class="bd-toc-item navbar-nav active">
        <p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>

    </div>
</nav></div>
    </div>


  <div class="sidebar-primary-items__end sidebar-primary__section">
  </div>

  <div id="rtd-footer-container"></div>


      </div>

      <main id="main-content" class="bd-main" role="main">


<div class="sbt-scroll-pixel-helper"></div>

          <div class="bd-content">
            <div class="bd-article-container">

              <div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">

    <div class="header-article-items__start">

        <div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
  <span class="fa-solid fa-bars"></span>
</button></div>

    </div>


    <div class="header-article-items__end">

        <div class="header-article-item">

<div class="article-header-buttons">


<div class="dropdown dropdown-download-buttons">
  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
    <i class="fas fa-download"></i>
  </button>
  <ul class="dropdown-menu">


      <li><a href="../_sources/modules/02_activations_ABOUT.md" target="_blank"
   class="btn btn-sm btn-download-source-button dropdown-item"
   title="Download source file"
   data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-file"></i>
  </span>
<span class="btn__text-container">.md</span>
</a>
</li>


      <li>
<button onclick="window.print()"
  class="btn btn-sm btn-download-pdf-button dropdown-item"
  title="Print to PDF"
  data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-file-pdf"></i>
  </span>
<span class="btn__text-container">.pdf</span>
</button>
</li>

  </ul>
</div>


<button onclick="toggleFullScreen()"
  class="btn btn-sm btn-fullscreen-button"
  title="Fullscreen mode"
  data-bs-placement="bottom" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-expand"></i>
  </span>

</button>


<script>
document.write(`
  <button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
    <i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
    <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
  </button>
`);
</script>


<script>
document.write(`
  <button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass fa-lg"></i>
  </button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <span class="fa-solid fa-list"></span>
</button>
</div></div>

    </div>

</div>
</div>


<div id="jb-print-docs-body" class="onlyprint">
    <h1>02. Activations</h1>
    <!-- Table of contents -->
    <div id="print-main-content">
        <div id="jb-print-toc">

            <div>
                <h2> Contents </h2>
            </div>
            <nav aria-label="Page">
                <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-reflect">Build → Use → Reflect</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#relu-the-sparsity-creator">ReLU - The Sparsity Creator</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#sigmoid-the-probabilistic-gate">Sigmoid - The Probabilistic Gate</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tanh-the-zero-centered-alternative">Tanh - The Zero-Centered Alternative</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#gelu-the-smooth-modern-choice">GELU - The Smooth Modern Choice</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#softmax-the-probability-distributor">Softmax - The Probability Distributor</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mathematical-foundations">Mathematical Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
            </nav>
        </div>
    </div>
</div>


<div id="searchbox"></div>
                <article class="bd-article">

  <section id="activations">
<h1>02. Activations<a class="headerlink" href="#activations" title="Link to this heading">#</a></h1>
<p><strong>FOUNDATION TIER</strong> | Difficulty: ⭐⭐ (2/4) | Time: 3-4 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>Activation functions are the mathematical operations that introduce non-linearity into neural networks, transforming them from simple linear regressors into universal function approximators. Without activations, stacking layers would be pointless—multiple linear transformations collapse to a single linear operation. With activations, each layer learns increasingly complex representations, enabling networks to approximate any continuous function. This module implements five essential activation functions with proper numerical stability, preparing you to understand what happens every time you call <code class="docutils literal notranslate"><span class="pre">F.relu(x)</span></code> or <code class="docutils literal notranslate"><span class="pre">torch.sigmoid(x)</span></code> in production code.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Systems Understanding</strong>: Recognize activation functions as the critical non-linearity that enables universal function approximation, understanding their role in memory consumption (activation caching), computational bottlenecks (billions of calls per training run), and gradient flow through deep architectures</p></li>
<li><p><strong>Core Implementation</strong>: Build ReLU, Sigmoid, Tanh, GELU, and Softmax with numerical stability techniques (max subtraction, conditional computation) that prevent overflow/underflow while maintaining mathematical correctness</p></li>
<li><p><strong>Pattern Recognition</strong>: Understand function properties—ReLU’s sparsity and [0, ∞) range, Sigmoid’s (0,1) probabilistic outputs, Tanh’s (-1,1) zero-centered gradients, GELU’s smoothness, Softmax’s probability distributions—and why each serves specific architectural roles</p></li>
<li><p><strong>Framework Connection</strong>: See how your implementations mirror <code class="docutils literal notranslate"><span class="pre">torch.nn.ReLU</span></code>, <code class="docutils literal notranslate"><span class="pre">torch.nn.Sigmoid</span></code>, <code class="docutils literal notranslate"><span class="pre">torch.nn.Tanh</span></code>, <code class="docutils literal notranslate"><span class="pre">torch.nn.GELU</span></code>, and <code class="docutils literal notranslate"><span class="pre">F.softmax</span></code>, understanding the actual mathematical operations behind PyTorch’s abstractions used throughout ResNet, BERT, GPT, and vision transformers</p></li>
<li><p><strong>Performance Trade-offs</strong>: Analyze computational cost (element-wise operations vs exponentials), memory implications (activation caching for backprop), and gradient behavior (vanishing gradients in Sigmoid/Tanh vs ReLU’s constant gradients), understanding why ReLU dominates hidden layers while Sigmoid/Softmax serve specific output roles</p></li>
</ul>
</section>
<section id="build-use-reflect">
<h2>Build → Use → Reflect<a class="headerlink" href="#build-use-reflect" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorch’s <strong>Build → Use → Reflect</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement five core activation functions (ReLU, Sigmoid, Tanh, GELU, Softmax) with numerical stability. Handle overflow in exponentials through max subtraction and conditional computation, ensure shape preservation across operations, and maintain proper value ranges ([0,∞) for ReLU, (0,1) for Sigmoid, (-1,1) for Tanh, probability distributions for Softmax)</p></li>
<li><p><strong>Use</strong>: Apply activations to real tensors with various ranges and shapes. Test with extreme values (±1000) to verify numerical stability, visualize function behavior across input domains, integrate with Tensor operations from Module 01, and chain activations to simulate simple neural network data flow (Input → ReLU → Softmax)</p></li>
<li><p><strong>Reflect</strong>: Understand why each activation exists in production systems—why ReLU enables sparse representations (many zeros) that accelerate computation and reduce overfitting, how Sigmoid creates gates (0 to 1 control signals) in LSTM/GRU architectures, why Tanh’s zero-centered outputs improve optimization dynamics, how GELU’s smoothness helps transformers, and why Softmax’s probability distributions are essential for classification</p></li>
</ol>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="relu-the-sparsity-creator">
<h3>ReLU - The Sparsity Creator<a class="headerlink" href="#relu-the-sparsity-creator" title="Link to this heading">#</a></h3>
<p>ReLU (Rectified Linear Unit) is the workhorse of modern deep learning, used in hidden layers of ResNet, VGG, and most convolutional architectures.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">ReLU</span><span class="p">:</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;ReLU activation: f(x) = max(0, x)&quot;&quot;&quot;</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="c1"># Zero negative values, preserve positive values</span>
        <span class="k">return</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">maximum</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">x</span><span class="o">.</span><span class="n">data</span><span class="p">))</span>
</pre></div>
</div>
<p><strong>Mathematical Definition</strong>: <code class="docutils literal notranslate"><span class="pre">f(x)</span> <span class="pre">=</span> <span class="pre">max(0,</span> <span class="pre">x)</span></code></p>
<p><strong>Key Properties</strong>:</p>
<ul class="simple">
<li><p><strong>Range</strong>: [0, ∞) - unbounded above</p></li>
<li><p><strong>Gradient</strong>: 0 for x &lt; 0, 1 for x &gt; 0 (undefined at x = 0)</p></li>
<li><p><strong>Sparsity</strong>: Produces many exact zeros (sparse activations)</p></li>
<li><p><strong>Computational Cost</strong>: Trivial (element-wise comparison)</p></li>
</ul>
<p><strong>Why ReLU Dominates Hidden Layers</strong>:</p>
<ul class="simple">
<li><p>No vanishing gradient problem (gradient is 1 for positive inputs)</p></li>
<li><p>Computationally efficient (simple max operation)</p></li>
<li><p>Creates sparsity (zeros) that reduces computation and helps regularization</p></li>
<li><p>Empirically outperforms Sigmoid/Tanh in deep networks</p></li>
</ul>
<p><strong>Watch Out For</strong>: “Dying ReLU” problem—neurons can get stuck outputting zero if inputs become consistently negative during training. Variants like Leaky ReLU (allows small negative slope) address this.</p>
</section>
<section id="sigmoid-the-probabilistic-gate">
<h3>Sigmoid - The Probabilistic Gate<a class="headerlink" href="#sigmoid-the-probabilistic-gate" title="Link to this heading">#</a></h3>
<p>Sigmoid maps any real number to (0, 1), making it essential for binary classification and gating mechanisms in LSTMs/GRUs.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">Sigmoid</span><span class="p">:</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;Sigmoid activation: σ(x) = 1/(1 + e^(-x))&quot;&quot;&quot;</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="c1"># Numerical stability: avoid exp() overflow</span>
        <span class="n">data</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">data</span>
        <span class="k">return</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">where</span><span class="p">(</span>
            <span class="n">data</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span>
            <span class="mi">1</span> <span class="o">/</span> <span class="p">(</span><span class="mi">1</span> <span class="o">+</span> <span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="o">-</span><span class="n">data</span><span class="p">)),</span>           <span class="c1"># Positive values</span>
            <span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> <span class="o">/</span> <span class="p">(</span><span class="mi">1</span> <span class="o">+</span> <span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="n">data</span><span class="p">))</span>  <span class="c1"># Negative values</span>
        <span class="p">))</span>
</pre></div>
</div>
<p><strong>Mathematical Definition</strong>: <code class="docutils literal notranslate"><span class="pre">σ(x)</span> <span class="pre">=</span> <span class="pre">1/(1</span> <span class="pre">+</span> <span class="pre">e^(-x))</span></code></p>
<p><strong>Key Properties</strong>:</p>
<ul class="simple">
<li><p><strong>Range</strong>: (0, 1) - strictly bounded</p></li>
<li><p><strong>Gradient</strong>: σ(x)(1 - σ(x)), maximum 0.25 at x = 0</p></li>
<li><p><strong>Symmetry</strong>: σ(-x) = 1 - σ(x)</p></li>
<li><p><strong>Computational Cost</strong>: One exponential per element</p></li>
</ul>
<p><strong>Numerical Stability Critical</strong>:</p>
<ul class="simple">
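<li><p>Naive <code class="docutils literal notranslate"><span class="pre">1/(1</span> <span class="pre">+</span> <span class="pre">exp(-x))</span></code> overflows for large negative x, where <code class="docutils literal notranslate"><span class="pre">exp(-x)</span></code> grows without bound</p></li>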
<li><p>For x ≥ 0: use <code class="docutils literal notranslate"><span class="pre">1/(1</span> <span class="pre">+</span> <span class="pre">exp(-x))</span></code> (stable)</p></li>
<li><p>For x &lt; 0: use <code class="docutils literal notranslate"><span class="pre">exp(x)/(1</span> <span class="pre">+</span> <span class="pre">exp(x))</span></code> (stable)</p></li>
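<li><p>Choosing the formula per element keeps every returned value finite and correct; note that <code class="docutils literal notranslate"><span class="pre">np.where</span></code> evaluates both branches, so NumPy may still emit overflow warnings for the discarded results</p></li>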
</ul>
<p><strong>Production Use Cases</strong>:</p>
<ul class="simple">
<li><p>Binary classification output layer (probability of positive class)</p></li>
<li><p>LSTM/GRU gates (input gate, forget gate, output gate)</p></li>
<li><p>Attention mechanisms (before softmax normalization)</p></li>
</ul>
<p><strong>Gradient Problem</strong>: Maximum derivative is 0.25, meaning gradients shrink by ≥75% per layer. In deep networks (&gt;10 layers), gradients vanish exponentially, making training difficult. This is why ReLU replaced Sigmoid in hidden layers.</p>
</section>
<section id="tanh-the-zero-centered-alternative">
<h3>Tanh - The Zero-Centered Alternative<a class="headerlink" href="#tanh-the-zero-centered-alternative" title="Link to this heading">#</a></h3>
<p>Tanh (hyperbolic tangent) maps inputs to (-1, 1), providing zero-centered outputs that improve gradient flow compared to Sigmoid.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">Tanh</span><span class="p">:</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;Tanh activation: f(x) = (e^x - e^(-x))/(e^x + e^(-x))&quot;&quot;&quot;</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">tanh</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">data</span><span class="p">))</span>
</pre></div>
</div>
<p><strong>Mathematical Definition</strong>: <code class="docutils literal notranslate"><span class="pre">tanh(x)</span> <span class="pre">=</span> <span class="pre">(e^x</span> <span class="pre">-</span> <span class="pre">e^(-x))/(e^x</span> <span class="pre">+</span> <span class="pre">e^(-x))</span></code></p>
<p><strong>Key Properties</strong>:</p>
<ul class="simple">
<li><p><strong>Range</strong>: (-1, 1) - symmetric around zero</p></li>
<li><p><strong>Gradient</strong>: 1 - tanh²(x), maximum 1.0 at x = 0</p></li>
<li><p><strong>Symmetry</strong>: tanh(-x) = -tanh(x) (odd function)</p></li>
<li><p><strong>Computational Cost</strong>: Two exponentials (or NumPy optimized)</p></li>
</ul>
<p><strong>Why Zero-Centered Matters</strong>:</p>
<ul class="simple">
<li><p>Tanh outputs have mean ≈ 0, unlike Sigmoid’s mean ≈ 0.5</p></li>
<li><p>Gradients don’t systematically bias weight updates in one direction</p></li>
<li><p>Helps optimization in shallow networks and RNN cells</p></li>
</ul>
<p><strong>Production Use Cases</strong>:</p>
<ul class="simple">
<li><p>LSTM/GRU cell state computation (candidate values in the range (-1, 1))</p></li>
<li><p>Output layer when you need symmetric bounded outputs</p></li>
<li><p>Some shallow networks (though ReLU is usually preferred now)</p></li>
</ul>
<p><strong>Still Has Vanishing Gradients</strong>: The maximum derivative is 1.0 (better than Sigmoid’s 0.25), but Tanh still saturates for |x| &gt; 2, causing vanishing gradients in deep networks.</p>
</section>
<section id="gelu-the-smooth-modern-choice">
<h3>GELU - The Smooth Modern Choice<a class="headerlink" href="#gelu-the-smooth-modern-choice" title="Link to this heading">#</a></h3>
<p>GELU (Gaussian Error Linear Unit) is a smooth approximation to ReLU, used in modern transformer architectures like GPT, BERT, and Vision Transformers.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">GELU</span><span class="p">:</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;GELU activation: f(x) ≈ x * Sigmoid(1.702 * x)&quot;&quot;&quot;</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="c1"># Approximation: x * sigmoid(1.702 * x)</span>
        <span class="n">sigmoid_part</span> <span class="o">=</span> <span class="mf">1.0</span> <span class="o">/</span> <span class="p">(</span><span class="mf">1.0</span> <span class="o">+</span> <span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="o">-</span><span class="mf">1.702</span> <span class="o">*</span> <span class="n">x</span><span class="o">.</span><span class="n">data</span><span class="p">))</span>
        <span class="k">return</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">data</span> <span class="o">*</span> <span class="n">sigmoid_part</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Mathematical Definition</strong>: <code class="docutils literal notranslate"><span class="pre">GELU(x)</span> <span class="pre">=</span> <span class="pre">x</span> <span class="pre">·</span> <span class="pre">Φ(x)</span> <span class="pre">≈</span> <span class="pre">x</span> <span class="pre">·</span> <span class="pre">σ(1.702x)</span></code> where Φ(x) is the cumulative distribution function of the standard normal distribution</p>
<p><strong>Key Properties</strong>:</p>
<ul class="simple">
<li><p><strong>Range</strong>: (-∞, ∞) - unbounded like ReLU</p></li>
<li><p><strong>Gradient</strong>: Smooth everywhere (no sharp corner at x = 0)</p></li>
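<li><p><strong>Approximation</strong>: The 1.702 constant is an empirical fit chosen so that σ(1.702x) closely tracks Φ(x); the √(2/π) ≈ 0.798 constant belongs to the separate tanh-based GELU approximation</p></li>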
<li><p><strong>Computational Cost</strong>: One exponential (similar to Sigmoid)</p></li>
</ul>
<p><strong>Why Transformers Use GELU</strong>:</p>
<ul class="simple">
<li><p>Smooth differentiability everywhere (unlike ReLU’s corner at x = 0)</p></li>
<li><p>Empirically performs better than ReLU in transformer architectures</p></li>
<li><p>Non-monotonic behavior (slight negative region) helps representation learning</p></li>
<li><p>Used in GPT, BERT, RoBERTa, Vision Transformers</p></li>
</ul>
<p><strong>Comparison to ReLU</strong>: GELU is smoother (differentiable everywhere) but more expensive (requires exponential). In transformers, the extra cost is negligible compared to attention computation, and the smoothness helps optimization.</p>
</section>
<section id="softmax-the-probability-distributor">
<h3>Softmax - The Probability Distributor<a class="headerlink" href="#softmax-the-probability-distributor" title="Link to this heading">#</a></h3>
<p>Softmax converts any vector into a valid probability distribution where all outputs are positive and sum to exactly 1.0.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">Softmax</span><span class="p">:</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))&quot;&quot;&quot;</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">dim</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="c1"># Numerical stability: subtract max before exp</span>
        <span class="n">x_max_data</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">dim</span><span class="p">,</span> <span class="n">keepdims</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
        <span class="n">x_shifted</span> <span class="o">=</span> <span class="n">x</span> <span class="o">-</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">x_max_data</span><span class="p">)</span>
        <span class="n">exp_values</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="n">x_shifted</span><span class="o">.</span><span class="n">data</span><span class="p">))</span>
        <span class="n">exp_sum</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">exp_values</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">dim</span><span class="p">,</span> <span class="n">keepdims</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
        <span class="k">return</span> <span class="n">exp_values</span> <span class="o">/</span> <span class="n">exp_sum</span>
</pre></div>
</div>
<p><strong>Mathematical Definition</strong>: <code class="docutils literal notranslate"><span class="pre">softmax(x_i)</span> <span class="pre">=</span> <span class="pre">e^(x_i)</span> <span class="pre">/</span> <span class="pre">Σ_j</span> <span class="pre">e^(x_j)</span></code></p>
<p><strong>Key Properties</strong>:</p>
<ul class="simple">
<li><p><strong>Range</strong>: (0, 1) with Σ outputs = 1.0 exactly</p></li>
<li><p><strong>Gradient</strong>: Complex (involves all elements, not just element-wise)</p></li>
<li><p><strong>Translation Invariant</strong>: softmax(x + c) = softmax(x)</p></li>
<li><p><strong>Computational Cost</strong>: One exponential per element + sum reduction</p></li>
</ul>
<p><strong>Numerical Stability Critical</strong>:</p>
<ul class="simple">
<li><p>Naive <code class="docutils literal notranslate"><span class="pre">exp(x_i)</span> <span class="pre">/</span> <span class="pre">sum(exp(x_j))</span></code> overflows for large values</p></li>
<li><p>Subtract max before exponential: <code class="docutils literal notranslate"><span class="pre">exp(x</span> <span class="pre">-</span> <span class="pre">max(x))</span></code></p></li>
<li><p>Mathematically equivalent due to translation invariance</p></li>
<li><p>Prevents overflow while maintaining correct probabilities</p></li>
</ul>
<p><strong>Production Use Cases</strong>:</p>
<ul class="simple">
<li><p>Multi-class classification output layer (class probabilities)</p></li>
<li><p>Attention weights in transformers (probability distribution over sequence)</p></li>
<li><p>Any time you need a valid discrete probability distribution</p></li>
</ul>
<p><strong>Cross-Entropy Connection</strong>: In practice, Softmax is almost always paired with cross-entropy loss. PyTorch’s <code class="docutils literal notranslate"><span class="pre">F.cross_entropy</span></code> combines both operations with additional numerical stability (LogSumExp trick).</p>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure you have completed Module 01 (Tensor) before starting:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch

<span class="c1"># Verify tensor module is complete</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>--module<span class="w"> </span>tensor

<span class="c1"># Expected: ✓ Module 01 complete!</span>
</pre></div>
</div>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/02_activations/activations_dev.ipynb</span></code> (or <code class="docutils literal notranslate"><span class="pre">.py</span></code> via Jupytext)</p></li>
<li><p><strong>Implement ReLU</strong>: Simple max(0, x) operation using <code class="docutils literal notranslate"><span class="pre">np.maximum</span></code></p></li>
<li><p><strong>Build Sigmoid</strong>: Implement with numerical stability using conditional computation for positive/negative values</p></li>
<li><p><strong>Create Tanh</strong>: Use <code class="docutils literal notranslate"><span class="pre">np.tanh</span></code> for hyperbolic tangent transformation</p></li>
<li><p><strong>Add GELU</strong>: Implement smooth approximation using <code class="docutils literal notranslate"><span class="pre">x</span> <span class="pre">*</span> <span class="pre">sigmoid(1.702</span> <span class="pre">*</span> <span class="pre">x)</span></code></p></li>
<li><p><strong>Build Softmax</strong>: Implement with max subtraction for numerical stability, handle dimension parameter for multi-dimensional tensors</p></li>
<li><p><strong>Export and verify</strong>: Run <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">02</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">--module</span> <span class="pre">activations</span></code></p></li>
</ol>
<p><strong>Development Tips</strong>:</p>
<ul class="simple">
<li><p>Test with extreme values (±1000) to verify numerical stability</p></li>
<li><p>Verify output ranges: ReLU [0, ∞), Sigmoid (0,1), Tanh (-1,1)</p></li>
<li><p>Check Softmax sums to 1.0 along specified dimension</p></li>
<li><p>Test with multi-dimensional tensors (batches) to ensure shape preservation</p></li>
</ul>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify all activation implementations:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>--module<span class="w"> </span>activations

<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>activations<span class="w"> </span>-v

<span class="c1"># Test specific activation</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/test_activations.py::test_relu<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p>✅ <strong>ReLU Correctness</strong>: Verifies max(0, x) behavior, sparsity property (negative → 0, positive preserved), and proper handling of exactly zero inputs</p></li>
<li><p>✅ <strong>Sigmoid Numerical Stability</strong>: Tests extreme values (±1000) don’t cause overflow/underflow, validates (0,1) range constraints, confirms sigmoid(0) = 0.5 exactly</p></li>
<li><p>✅ <strong>Tanh Properties</strong>: Validates (-1,1) range, symmetry property (tanh(-x) = -tanh(x)), zero-centered behavior (tanh(0) = 0), and extreme value convergence</p></li>
<li><p>✅ <strong>GELU Smoothness</strong>: Confirms smooth differentiability (no sharp corners), validates approximation accuracy (GELU(0) ≈ 0, GELU(1) ≈ 0.84), and checks non-monotonic behavior</p></li>
<li><p>✅ <strong>Softmax Probability Distribution</strong>: Verifies sum equals 1.0 exactly, all outputs in (0,1) range, largest input receives highest probability, numerical stability with large inputs, and correct dimension handling for multi-dimensional tensors</p></li>
</ul>
</section>
<section id="inline-testing-validation">
<h3>Inline Testing &amp; Validation<a class="headerlink" href="#inline-testing-validation" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive inline unit tests that run during development:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Example inline test output</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">ReLU</span><span class="o">...</span>
<span class="err">✅</span> <span class="n">ReLU</span> <span class="n">zeros</span> <span class="n">negative</span> <span class="n">values</span> <span class="n">correctly</span>
<span class="err">✅</span> <span class="n">ReLU</span> <span class="n">preserves</span> <span class="n">positive</span> <span class="n">values</span>
<span class="err">✅</span> <span class="n">ReLU</span> <span class="n">creates</span> <span class="n">sparsity</span> <span class="p">(</span><span class="mi">3</span><span class="o">/</span><span class="mi">4</span> <span class="n">values</span> <span class="n">are</span> <span class="n">zero</span><span class="p">)</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">ReLU</span> <span class="err">✓</span>

<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Sigmoid</span><span class="o">...</span>
<span class="err">✅</span> <span class="n">Sigmoid</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="o">=</span> <span class="mf">0.5</span> <span class="n">exactly</span>
<span class="err">✅</span> <span class="n">All</span> <span class="n">outputs</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="nb">range</span>
<span class="err">✅</span> <span class="n">Numerically</span> <span class="n">stable</span> <span class="k">with</span> <span class="n">extreme</span> <span class="n">values</span> <span class="p">(</span><span class="err">±</span><span class="mi">1000</span><span class="p">)</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">Sigmoid</span> <span class="err">✓</span>

<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Softmax</span><span class="o">...</span>
<span class="err">✅</span> <span class="n">Outputs</span> <span class="nb">sum</span> <span class="n">to</span> <span class="mf">1.0</span> <span class="n">exactly</span>
<span class="err">✅</span> <span class="n">All</span> <span class="n">values</span> <span class="n">positive</span> <span class="ow">and</span> <span class="n">less</span> <span class="n">than</span> <span class="mi">1</span>
<span class="err">✅</span> <span class="n">Largest</span> <span class="nb">input</span> <span class="n">gets</span> <span class="n">highest</span> <span class="n">probability</span>
<span class="err">✅</span> <span class="n">Handles</span> <span class="n">large</span> <span class="n">numbers</span> <span class="n">without</span> <span class="n">overflow</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">Softmax</span> <span class="err">✓</span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
<p>Test activations interactively to understand their behavior:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">activations_dev</span><span class="w"> </span><span class="kn">import</span> <span class="n">ReLU</span><span class="p">,</span> <span class="n">Sigmoid</span><span class="p">,</span> <span class="n">Tanh</span><span class="p">,</span> <span class="n">GELU</span><span class="p">,</span> <span class="n">Softmax</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.tensor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>

<span class="c1"># Test ReLU sparsity</span>
<span class="n">relu</span> <span class="o">=</span> <span class="n">ReLU</span><span class="p">()</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">([</span><span class="o">-</span><span class="mi">2</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">relu</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>  <span class="c1"># [0, 0, 0, 1, 2] - 60% sparsity!</span>

<span class="c1"># Test Sigmoid probability mapping</span>
<span class="n">sigmoid</span> <span class="o">=</span> <span class="n">Sigmoid</span><span class="p">()</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">100.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">100.0</span><span class="p">])</span>  <span class="c1"># Extreme values</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">sigmoid</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>  <span class="c1"># [0.5, 1.0, 0.0] - no overflow!</span>

<span class="c1"># Test Softmax probability distribution</span>
<span class="n">softmax</span> <span class="o">=</span> <span class="n">Softmax</span><span class="p">()</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">])</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">softmax</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>  <span class="c1"># [0.09, 0.24, 0.67]</span>
<span class="nb">print</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">sum</span><span class="p">())</span>  <span class="c1"># 1.0 exactly!</span>

<span class="c1"># Test activation chaining (simulate simple network)</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">([[</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]])</span>  <span class="c1"># Batch of 1</span>
<span class="n">hidden</span> <span class="o">=</span> <span class="n">relu</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>  <span class="c1"># Hidden layer: [0, 0, 1, 2]</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">softmax</span><span class="p">(</span><span class="n">hidden</span><span class="p">)</span>  <span class="c1"># Output probabilities</span>
<span class="nb">print</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">sum</span><span class="p">())</span>  <span class="c1"># 1.0 - valid distribution!</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-applications">
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Computer Vision Networks</strong>: ResNet-50 applies ReLU to approximately 23 million elements per forward pass (after every convolution), then uses Softmax on 1000 logits for ImageNet classification. How much memory is required just to cache these activations for backpropagation in a batch of 32 images?</p></li>
<li><p><strong>Transformer Language Models</strong>: BERT-Large has 24 layers × 1024 hidden units × sequence length 512 = 12.6M activations per example. With GELU requiring exponential computation, how does this compare to ReLU’s computational cost across a 1M example training run?</p></li>
<li><p><strong>Recurrent Networks</strong>: LSTM cells use 4 gates (input, forget, output, cell) with Sigmoid/Tanh activations at every timestep. For a sequence of length 100 with 512 hidden units, how many exponential operations are required compared to a simple ReLU-based feedforward network?</p></li>
<li><p><strong>Mobile Inference</strong>: On-device neural networks must be extremely efficient. Given that ReLU is a simple comparison while GELU requires exponential computation, what are the latency implications for a 20-layer network running on CPU with no hardware acceleration?</p></li>
</ul>
</section>
<section id="mathematical-foundations">
<h3>Mathematical Foundations<a class="headerlink" href="#mathematical-foundations" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Universal Function Approximation</strong>: The universal approximation theorem states that a neural network with even one hidden layer can approximate any continuous function, BUT only if it has non-linear activations. Why does linearity prevent universal approximation, and what property of non-linear functions (like ReLU, Sigmoid, Tanh) enables it?</p></li>
<li><p><strong>Gradient Flow and Saturation</strong>: Sigmoid’s derivative is σ(x)(1-σ(x)) with maximum value 0.25. In a 10-layer network using Sigmoid activations, what is the maximum gradient magnitude at layer 1 if the output gradient is 1.0? How does this explain the vanishing gradient problem that led to ReLU’s adoption?</p></li>
<li><p><strong>Numerical Stability and Conditioning</strong>: When computing Softmax, why does subtracting the maximum value before exponential (exp(x - max(x))) prevent overflow while maintaining mathematical correctness? What property of the exponential function makes this transformation valid?</p></li>
<li><p><strong>Activation Sparsity and Compression</strong>: ReLU produces exact zeros (sparse activations) while Sigmoid produces values close to but never exactly zero. How does this affect model compression techniques like pruning and quantization? Why are sparse activations more amenable to INT8 quantization?</p></li>
</ul>
</section>
<section id="performance-characteristics">
<h3>Performance Characteristics<a class="headerlink" href="#performance-characteristics" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Memory Footprint of Activation Caching</strong>: During backpropagation, forward pass activations must be stored to compute gradients. For a ResNet-50 processing 224×224×3 images with batch size 64, activation caching requires approximately 3GB of memory. How does this compare to the model’s parameter memory (25M params × 4 bytes ≈ 100MB)? What is the scaling relationship between batch size and activation memory?</p></li>
<li><p><strong>Computational Intensity on Different Hardware</strong>: ReLU is trivially parallelizable (independent element-wise max). On a GPU with 10,000 CUDA cores, what is the theoretical speedup vs single-core CPU? Why does practical speedup plateau at much lower values (memory bandwidth, kernel launch overhead)?</p></li>
<li><p><strong>Branch Prediction and CPU Performance</strong>: ReLU’s conditional behavior (<code class="docutils literal notranslate"><span class="pre">if</span> <span class="pre">x</span> <span class="pre">&gt;</span> <span class="pre">0</span></code>) can cause branch misprediction penalties on CPUs. For a random uniform distribution of inputs [-1, 1], branch prediction accuracy is ~50%. How does this affect CPU performance compared to branchless implementations using <code class="docutils literal notranslate"><span class="pre">max(0,</span> <span class="pre">x)</span></code>?</p></li>
<li><p><strong>Exponential Computation Cost</strong>: Sigmoid, Tanh, GELU, and Softmax all require exponential computation. On modern CPUs, <code class="docutils literal notranslate"><span class="pre">exp(x)</span></code> takes ~10-20 cycles vs ~1 cycle for addition. For a network with 1M activations, how does this computational difference compound across training iterations? Why do modern frameworks use lookup tables or polynomial approximations for exponentials?</p></li>
</ul>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>You’re about to implement the mathematical functions that give neural networks their power to learn complex patterns! Every breakthrough in deep learning—from AlexNet’s ImageNet victory to GPT’s language understanding to diffusion models’ image generation—relies on the simple activation functions you’ll build in this module.</p>
<p>Understanding activations from first principles means implementing their mathematics, handling numerical stability edge cases (overflow, underflow), and grasping their properties (ranges, gradients, symmetry). This knowledge will give you deep insight into why ReLU dominates hidden layers, why Sigmoid creates effective gates in LSTMs, why Tanh helps optimization, why GELU powers transformers, and why Softmax is essential for classification. You’ll understand exactly what happens when you call <code class="docutils literal notranslate"><span class="pre">F.relu(x)</span></code> or <code class="docutils literal notranslate"><span class="pre">torch.sigmoid(x)</span></code> in production code—not just the API, but the actual math, numerical considerations, and performance implications.</p>
<p>This is where pure mathematics meets practical machine learning. Take your time with each activation, test thoroughly with extreme values, visualize their behavior across input ranges, and enjoy building the non-linearity that powers modern AI. Let’s turn linear transformations into intelligent representations!</p>
<p>Choose your preferred way to engage with this module:</p>
<!-- Launch cards: each card ends with a stretched link carrying the sd-hide-link-text class.
     Its text is a short description of the destination rather than the raw URL, so that
     assistive technology announces a meaningful link name. The href values are unchanged. -->
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
🚀 Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/02_activations/activations_dev.ipynb"><span>Launch the Activations notebook on Binder</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
⚡ Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/02_activations/activations_dev.ipynb"><span>Open the Activations notebook in Google Colab</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
📖 View Source</div>
<p class="sd-card-text">Browse the Python source code and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/02_activations/activations_dev.py"><span>View the Activations module source on GitHub</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">💾 Save Your Progress</p>
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="../modules/01_tensor_ABOUT.html" title="previous page">← Previous Module</a>
<a class="right-next" href="../modules/03_layers_ABOUT.html" title="next page">Next Module →</a>
</div>
</section>
</section>

    <!-- Thebe configuration (script type "text/x-thebe-config"). It sets requestKernel to true,
         names a Binder repo/ref, selects the "abcdef" CodeMirror theme in Python mode, and
         asks for a "python3" kernel with path ./modules.
         NOTE(review): the Binder repo below (binder-examples/jupyter-stacks-datascience) differs
         from the mlsysbook/TinyTorch repo used by this page's "Launch Binder" link; confirm
         that this is intended. -->
    <script type="text/x-thebe-config">
    {
        requestKernel: true,
        binderOptions: {
            repo: "binder-examples/jupyter-stacks-datascience",
            ref: "master",
        },
        codeMirrorConfig: {
            theme: "abcdef",
            mode: "python"
        },
        kernelOptions: {
            name: "python3",
            path: "./modules"
        },
        predefinedOutput: true
    }
    </script>
    <!-- Assigns the global kernelName (same value as kernelOptions.name above);
         presumably read by the Thebe integration scripts (verify before renaming). -->
    <script>kernelName = 'python3'</script>

                </article>


                <footer class="prev-next-footer d-print-none">
                  <!-- Previous/next page links. The angle icons are decorative (the link text
                       carries the meaning), so they are hidden from assistive technology. -->

<div class="prev-next-area">
    <a class="left-prev"
       href="01_tensor_ABOUT.html"
       title="previous page">
      <i class="fa-solid fa-angle-left" aria-hidden="true"></i>
      <div class="prev-next-info">
        <p class="prev-next-subtitle">previous</p>
        <p class="prev-next-title">01. Tensor</p>
      </div>
    </a>
    <a class="right-next"
       href="03_layers_ABOUT.html"
       title="next page">
      <div class="prev-next-info">
        <p class="prev-next-subtitle">next</p>
        <p class="prev-next-title">03. Layers</p>
      </div>
      <i class="fa-solid fa-angle-right" aria-hidden="true"></i>
    </a>
</div>
                </footer>

            </div>


                <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
                  <!-- Secondary sidebar: in-page table of contents. The nav carries an accessible
                       name so it can be told apart from other navigation landmarks, and the list
                       icon is decorative, so it is hidden from assistive technology. -->


  <div class="sidebar-secondary-item">
  <div class="page-toc tocsection onthispage">
    <i class="fa-solid fa-list" aria-hidden="true"></i> Contents
  </div>
  <nav class="bd-toc-nav page-toc" aria-label="Page contents">
    <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-reflect">Build → Use → Reflect</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#relu-the-sparsity-creator">ReLU - The Sparsity Creator</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#sigmoid-the-probabilistic-gate">Sigmoid - The Probabilistic Gate</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tanh-the-zero-centered-alternative">Tanh - The Zero-Centered Alternative</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#gelu-the-smooth-modern-choice">GELU - The Smooth Modern Choice</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#softmax-the-probability-distributor">Softmax - The Probability Distributor</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mathematical-foundations">Mathematical Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
  </nav></div>

</div></div>


          </div>
          <footer class="bd-footer-content">
            <!-- Content footer: author credit and copyright notice, followed by two
                 footer items that carry no content. -->
            <div class="bd-footer-content__inner container">
              <div class="footer-item">
                <p class="component-author">
                  By Prof. Vijay Janapa Reddi (Harvard University)
                </p>
              </div>
              <div class="footer-item">
                <p class="copyright">
                  © Copyright 2025.
                  <br>
                </p>
              </div>
              <div class="footer-item">
              </div>
              <div class="footer-item">
              </div>
            </div>
          </footer>


      </main>
    </div>
  </div>

  <!-- Bootstrap and pydata-sphinx-theme script bundles, placed at the end of <body>
       (after the page content) so they do not block parsing of the markup above. -->
  <script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>

  <!-- Page-level footer element; it has no content on this page. -->
  <footer class="bd-footer">
  </footer>
  </body>
</html>