Files
TinyTorch/dev/modules/10_tokenization_ABOUT.html

1517 lines
122 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
<title>10. Tokenization - Text to Numerical Sequences &#8212; Tiny🔥Torch</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
<link rel="stylesheet" href="../_static/styles/sphinx-book-theme.css?digest=14f4ca6b54d191a8c7657f6c759bf11a5fb86285" type="text/css" />
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css" />
<link rel="stylesheet" type="text/css" href="../_static/custom.css" />
<link rel="stylesheet" type="text/css" href="../_static/design-style.4045f2051d55cab465a707391d5b2007.min.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script src="../_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../_static/togglebutton.js"></script>
<script src="../_static/ml-timeline.js"></script>
<script src="../_static/wip-banner.js"></script>
<script src="../_static/sidebar-link.js"></script>
<script src="../_static/hero-carousel.js"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="../_static/design-tabs.js"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"
const thebe_selector = ".thebe,.cell"
const thebe_selector_input = "pre"
const thebe_selector_output = ".output, .cell_output"
</script>
<script async="async" src="../_static/sphinx-thebe.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/10_tokenization_ABOUT';</script>
<link rel="shortcut icon" href="../_static/favicon.svg"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="11. Embeddings - Token to Vector Representations" href="11_embeddings_ABOUT.html" />
<link rel="prev" title="09. Spatial Operations" href="09_spatial_ABOUT.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search this book..."
aria-label="Search this book..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../intro.html">
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<ul class="nav bd-sidenav bd-sidenav__home-link">
<li class="toctree-l1">
<a class="reference internal" href="../intro.html">
Getting Started
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../quickstart-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../student-workflow.html">Student Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../usage-paths/classroom-use.html">For Instructors</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><label class="sidebar-toggle primary-toggle btn btn-sm" for="pst-primary-sidebar-checkbox" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</label></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-source-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Source repositories">
<i class="fab fa-github"></i>
</button>
<ul class="dropdown-menu">
<li><a href="https://github.com/mlsysbook/TinyTorch" target="_blank"
class="btn btn-sm btn-source-repository-button dropdown-item"
title="Source repository"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fab fa-github"></i>
</span>
<span class="btn__text-container">Repository</span>
</a>
</li>
<li><a href="https://github.com/mlsysbook/TinyTorch/edit/main/site/modules/10_tokenization_ABOUT.md" target="_blank"
class="btn btn-sm btn-source-edit-button dropdown-item"
title="Suggest edit"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-pencil-alt"></i>
</span>
<span class="btn__text-container">Suggest edit</span>
</a>
</li>
<li><a href="https://github.com/mlsysbook/TinyTorch/issues/new?title=Issue%20on%20page%20%2Fmodules/10_tokenization_ABOUT.html&body=Your%20issue%20content%20here." target="_blank"
class="btn btn-sm btn-source-issues-button dropdown-item"
title="Open an issue"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-lightbulb"></i>
</span>
<span class="btn__text-container">Open issue</span>
</a>
</li>
</ul>
</div>
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../_sources/modules/10_tokenization_ABOUT.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<label class="sidebar-toggle secondary-toggle btn btn-sm" for="pst-secondary-sidebar-checkbox" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</label>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>10. Tokenization - Text to Numerical Sequences</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-reflect">Build → Use → Reflect</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#base-tokenizer-interface">Base Tokenizer Interface</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#character-level-tokenizer">Character-Level Tokenizer</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#bpe-byte-pair-encoding-tokenizer">BPE (Byte Pair Encoding) Tokenizer</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tokenization-utilities">Tokenization Utilities</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tokenization-foundations">Tokenization Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="tokenization-text-to-numerical-sequences">
<h1>10. Tokenization - Text to Numerical Sequences<a class="headerlink" href="#tokenization-text-to-numerical-sequences" title="Permalink to this heading">#</a></h1>
<p><strong>ARCHITECTURE TIER</strong> | Difficulty: ⭐⭐ (2/4) | Time: 4-5 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Permalink to this heading">#</a></h2>
<p>Build tokenization systems that convert raw text into numerical sequences for language models. This module implements character-level and Byte Pair Encoding (BPE) tokenizers that balance vocabulary size, sequence length, and computational efficiency—the fundamental trade-off shaping every modern NLP system from GPT-4 to Google Translate. You'll understand why vocabulary size directly affects model parameters while sequence length impacts transformer computation, and how BPE balances both extremes.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Permalink to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Implement character-level tokenization with vocabulary management</strong>: Build tokenizers with bidirectional token-to-ID mappings, special token handling (PAD, UNK, BOS, EOS), and graceful unknown character handling for robust multilingual support</p></li>
<li><p><strong>Build BPE (Byte Pair Encoding) tokenizer</strong>: Implement the iterative merge algorithm that learns subword units by counting character pair frequencies—the approach behind GPT's tokenizer and closely related to the WordPiece algorithm used by BERT and other modern transformers</p></li>
<li><p><strong>Understand vocabulary size vs sequence length trade-offs</strong>: Analyze how vocabulary choices affect model parameters (embedding matrix size = vocab_size × embed_dim) and computation (transformer attention is O(n²) in sequence length)</p></li>
<li><p><strong>Design efficient text processing pipelines</strong>: Create production-ready tokenizers with encoding/decoding, vocabulary serialization for deployment, and proper special token management for batching</p></li>
<li><p><strong>Analyze tokenization throughput and compression ratios</strong>: Measure tokens/second performance, compare character vs BPE on sequence length reduction, and understand scaling to billions of tokens in production systems</p></li>
</ul>
</section>
<section id="build-use-reflect">
<h2>Build → Use → Reflect<a class="headerlink" href="#build-use-reflect" title="Permalink to this heading">#</a></h2>
<p>This module follows TinyTorch's <strong>Build → Use → Reflect</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement character-level tokenizer with vocabulary building and encode/decode operations, then build BPE algorithm that iteratively merges frequent character pairs to learn optimal subword units</p></li>
<li><p><strong>Use</strong>: Tokenize Shakespeare and modern text datasets, compare character vs BPE on sequence length reduction, measure tokenization throughput on large corpora, and test subword decomposition on rare/unknown words</p></li>
<li><p><strong>Reflect</strong>: Why does vocabulary size directly control model parameters (embedding matrix rows)? How does sequence length affect transformer computation (O(n²) attention)? What's the optimal balance for mobile deployment vs cloud serving? How do tokenization choices impact multilingual model design?</p></li>
</ol>
<div class="tip admonition">
<p class="admonition-title">Systems Reality Check</p>
<p><strong>Production Context</strong>: GPT-4 uses a 100K-token vocabulary trained on trillions of tokens. Every token in the vocabulary adds a row to the embedding matrix—at 12,288 dimensions (GPT-3's hidden size), that's 1.2B parameters just for embeddings. Meanwhile, transformers have O(n²) attention complexity, so reducing sequence length from 1000 to 300 tokens cuts attention computation by about 11x (1000² / 300² ≈ 11). This vocabulary size vs sequence length trade-off shapes every design decision in modern NLP: GPT-4's tokenizer roughly doubled the vocabulary shared by GPT-2 and GPT-3 (50K→100K), in part to handle code and reduce sequence lengths for long documents.</p>
<p><strong>Performance Note</strong>: Google Translate processes billions of sentences daily through tokenization pipelines. Tokenization throughput (measured in tokens/second) is critical for serving at scale—character-level achieves ~1M tokens/sec (simple lookup) while BPE achieves ~100K tokens/sec (iterative merge application). Production systems cache tokenization results and batch aggressively to amortize preprocessing costs. At OpenAI's scale ($700/million tokens), every tokenization optimization directly impacts economics.</p>
</div>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Permalink to this heading">#</a></h2>
<section id="base-tokenizer-interface">
<h3>Base Tokenizer Interface<a class="headerlink" href="#base-tokenizer-interface" title="Permalink to this heading">#</a></h3>
<p>All tokenizers share a common interface: encode text to token IDs and decode IDs back to text. This abstraction enables consistent usage across different tokenization strategies.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">Tokenizer</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Base tokenizer interface defining the contract for all tokenizers.</span>
<span class="sd"> All tokenization strategies (character, BPE, WordPiece) must implement:</span>
<span class="sd"> - encode(text) → List[int]: Convert text to token IDs</span>
<span class="sd"> - decode(token_ids) → str: Convert token IDs back to text</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="nf">encode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Convert text to list of token IDs.&quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;Subclasses must implement encode()&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">decode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Convert token IDs back to text.&quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;Subclasses must implement decode()&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Design Pattern</strong>: Abstract base class enforces consistent API across tokenization strategies, enabling drop-in replacement for performance testing (character vs BPE benchmarks).</p>
</section>
<section id="character-level-tokenizer">
<h3>Character-Level Tokenizer<a class="headerlink" href="#character-level-tokenizer" title="Permalink to this heading">#</a></h3>
<p>The simplest tokenization approach: each character becomes a token. Covers any text whose characters appear in the vocabulary (unseen characters fall back to <code>&lt;UNK&gt;</code>) with a tiny vocabulary (~100 characters for English text), but produces long sequences.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">CharTokenizer</span><span class="p">(</span><span class="n">Tokenizer</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Character-level tokenizer treating each character as a separate token.</span>
<span class="sd"> Trade-offs:</span>
<span class="sd"> - Small vocabulary (typically 100-500 characters)</span>
<span class="sd"> - Long sequences (1 character = 1 token)</span>
<span class="sd"> - Perfect coverage (no unknown tokens if vocab includes all Unicode)</span>
<span class="sd"> - Simple implementation (direct character-to-ID mapping)</span>
<span class="sd"> Example:</span>
<span class="sd"> &quot;hello&quot; → [&#39;h&#39;,&#39;e&#39;,&#39;l&#39;,&#39;l&#39;,&#39;o&#39;] → [8, 5, 12, 12, 15] (5 tokens)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">vocab</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Initialize with optional vocabulary.</span>
<span class="sd"> Args:</span>
<span class="sd"> vocab: List of characters to include in vocabulary.</span>
<span class="sd"> If None, vocabulary is built later via build_vocab().</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">vocab</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">vocab</span> <span class="o">=</span> <span class="p">[]</span>
<span class="c1"># Reserve ID 0 for unknown token (robust handling of unseen characters)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;&lt;UNK&gt;&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="n">vocab</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)</span>
<span class="c1"># Bidirectional mappings for efficient encode/decode</span>
<span class="bp">self</span><span class="o">.</span><span class="n">char_to_id</span> <span class="o">=</span> <span class="p">{</span><span class="n">char</span><span class="p">:</span> <span class="n">idx</span> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">char</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)}</span>
<span class="bp">self</span><span class="o">.</span><span class="n">id_to_char</span> <span class="o">=</span> <span class="p">{</span><span class="n">idx</span><span class="p">:</span> <span class="n">char</span> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">char</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)}</span>
<span class="c1"># Cache unknown token ID for fast lookup</span>
<span class="bp">self</span><span class="o">.</span><span class="n">unk_id</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">def</span><span class="w"> </span><span class="nf">build_vocab</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">corpus</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Build vocabulary from text corpus.</span>
<span class="sd"> Args:</span>
<span class="sd"> corpus: List of text strings to extract characters from.</span>
<span class="sd"> Process:</span>
<span class="sd"> 1. Collect all unique characters across entire corpus</span>
<span class="sd"> 2. Sort by Unicode code point for consistent ordering across runs</span>
<span class="sd"> 3. Rebuild char↔ID mappings with &lt;UNK&gt; token at position 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Extract all unique characters</span>
<span class="n">all_chars</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">for</span> <span class="n">text</span> <span class="ow">in</span> <span class="n">corpus</span><span class="p">:</span>
<span class="n">all_chars</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="c1"># Sort for reproducibility (important for model deployment)</span>
<span class="n">unique_chars</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">all_chars</span><span class="p">))</span>
<span class="c1"># Rebuild vocabulary with special token first</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;&lt;UNK&gt;&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="n">unique_chars</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)</span>
<span class="c1"># Rebuild bidirectional mappings</span>
<span class="bp">self</span><span class="o">.</span><span class="n">char_to_id</span> <span class="o">=</span> <span class="p">{</span><span class="n">char</span><span class="p">:</span> <span class="n">idx</span> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">char</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)}</span>
<span class="bp">self</span><span class="o">.</span><span class="n">id_to_char</span> <span class="o">=</span> <span class="p">{</span><span class="n">idx</span><span class="p">:</span> <span class="n">char</span> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">char</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)}</span>
<span class="k">def</span><span class="w"> </span><span class="nf">encode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Convert text to list of character IDs.</span>
<span class="sd"> Args:</span>
<span class="sd"> text: String to tokenize.</span>
<span class="sd"> Returns:</span>
<span class="sd"> List of integer token IDs, one per character.</span>
<span class="sd"> Unknown characters map to ID 0 (&lt;UNK&gt;).</span>
<span class="sd"> Example:</span>
<span class="sd"> &gt;&gt;&gt; tokenizer.encode(&quot;hello&quot;)</span>
<span class="sd"> [8, 5, 12, 12, 15] # Depends on vocabulary ordering</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">tokens</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">char</span> <span class="ow">in</span> <span class="n">text</span><span class="p">:</span>
<span class="c1"># Use .get() with unk_id default for graceful unknown handling</span>
<span class="n">tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">char_to_id</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">char</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unk_id</span><span class="p">))</span>
<span class="k">return</span> <span class="n">tokens</span>
<span class="k">def</span><span class="w"> </span><span class="nf">decode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Convert token IDs back to text.</span>
<span class="sd"> Args:</span>
<span class="sd"> tokens: List of integer token IDs.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Reconstructed text string.</span>
<span class="sd"> Invalid IDs map to the &#39;&lt;UNK&gt;&#39; token.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">chars</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">token_id</span> <span class="ow">in</span> <span class="n">tokens</span><span class="p">:</span>
<span class="n">char</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">id_to_char</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">token_id</span><span class="p">,</span> <span class="s1">&#39;&lt;UNK&gt;&#39;</span><span class="p">)</span>
<span class="n">chars</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">char</span><span class="p">)</span>
<span class="k">return</span> <span class="s1">&#39;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">chars</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Key Implementation Details:</strong></p>
<ul class="simple">
<li><p><strong>Special Token Reservation</strong>: <code class="docutils literal notranslate"><span class="pre">&lt;UNK&gt;</span></code> token must occupy ID 0 consistently across vocabularies for model compatibility</p></li>
<li><p><strong>Bidirectional Mappings</strong>: Both <code class="docutils literal notranslate"><span class="pre">char_to_id</span></code> (encoding) and <code class="docutils literal notranslate"><span class="pre">id_to_char</span></code> (decoding) enable O(1) lookup performance</p></li>
<li><p><strong>Unknown Character Handling</strong>: Graceful degradation prevents crashes on unseen characters (critical for multilingual models encountering rare Unicode)</p></li>
<li><p><strong>Vocabulary Consistency</strong>: Sorted character ordering ensures reproducible vocabularies across training runs (important for model deployment)</p></li>
</ul>
</section>
<section id="bpe-byte-pair-encoding-tokenizer">
<h3>BPE (Byte Pair Encoding) Tokenizer<a class="headerlink" href="#bpe-byte-pair-encoding-tokenizer" title="Permalink to this heading">#</a></h3>
<p>The algorithm powering GPT and modern transformers: iteratively merge frequent character pairs to discover optimal subword units. Balances vocabulary size (model parameters) with sequence length (computational cost).</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">BPETokenizer</span><span class="p">(</span><span class="n">Tokenizer</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Byte Pair Encoding tokenizer for subword tokenization.</span>
<span class="sd"> Algorithm:</span>
<span class="sd"> 1. Initialize: Start with character-level vocabulary</span>
<span class="sd"> 2. Count: Find all adjacent character pair frequencies in corpus</span>
<span class="sd"> 3. Merge: Replace most frequent pair with new merged token</span>
<span class="sd"> 4. Repeat: Continue until vocabulary reaches target size</span>
<span class="sd"> Trade-offs:</span>
<span class="sd"> - Larger vocabulary (typically 10K-50K tokens)</span>
<span class="sd"> - Shorter sequences (2-4x compression vs character-level)</span>
<span class="sd"> - Subword decomposition handles rare/unknown words gracefully</span>
<span class="sd"> - Training complexity (requires corpus statistics)</span>
<span class="sd"> Example:</span>
<span class="sd"> Training: &quot;hello&quot; appears 1000x, &quot;hell&quot; appears 500x</span>
<span class="sd"> Learns: &#39;h&#39;+&#39;e&#39; → &#39;he&#39; (freq pair), &#39;l&#39;+&#39;l&#39; → &#39;ll&#39; (freq pair)</span>
<span class="sd"> Result: &quot;hello&quot; → [&#39;he&#39;, &#39;ll&#39;, &#39;o&lt;/w&gt;&#39;] (3 tokens vs 5 characters)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">vocab_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1000</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Initialize BPE tokenizer.</span>
<span class="sd"> Args:</span>
<span class="sd"> vocab_size: Target vocabulary size (includes special tokens +</span>
<span class="sd"> characters + learned merges). Typical: 10K-50K.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span> <span class="o">=</span> <span class="n">vocab_size</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab</span> <span class="o">=</span> <span class="p">[]</span> <span class="c1"># Final vocabulary tokens</span>
<span class="bp">self</span><span class="o">.</span><span class="n">merges</span> <span class="o">=</span> <span class="p">[]</span> <span class="c1"># Learned merge rules in training order: [(left, right), ...]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">token_to_id</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># Token string → integer ID</span>
<span class="bp">self</span><span class="o">.</span><span class="n">id_to_token</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># Integer ID → token string</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_get_word_tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Convert word to character tokens with end-of-word marker.</span>
<span class="sd"> Args:</span>
<span class="sd"> word: String to tokenize at character level.</span>
<span class="sd"> Returns:</span>
<span class="sd"> List of character tokens with &#39;&lt;/w&gt;&#39; suffix on last character.</span>
<span class="sd"> End-of-word marker enables learning of word boundaries.</span>
<span class="sd"> Example:</span>
<span class="sd"> &gt;&gt;&gt; _get_word_tokens(&quot;hello&quot;)</span>
<span class="sd"> [&#39;h&#39;, &#39;e&#39;, &#39;l&#39;, &#39;l&#39;, &#39;o&lt;/w&gt;&#39;]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">word</span><span class="p">:</span>
<span class="k">return</span> <span class="p">[]</span>
<span class="n">tokens</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
<span class="n">tokens</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">+=</span> <span class="s1">&#39;&lt;/w&gt;&#39;</span> <span class="c1"># Mark word boundaries for BPE</span>
<span class="k">return</span> <span class="n">tokens</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_get_pairs</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word_tokens</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Set</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Extract all adjacent character pairs from token sequence.</span>
<span class="sd"> Args:</span>
<span class="sd"> word_tokens: List of token strings.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Set of unique adjacent pairs (useful for frequency counting; a pair repeated within one sequence appears only once).</span>
<span class="sd"> Example:</span>
<span class="sd"> &gt;&gt;&gt; _get_pairs([&#39;h&#39;, &#39;e&#39;, &#39;l&#39;, &#39;l&#39;, &#39;o&lt;/w&gt;&#39;])</span>
<span class="sd"> {(&#39;h&#39;, &#39;e&#39;), (&#39;e&#39;, &#39;l&#39;), (&#39;l&#39;, &#39;l&#39;), (&#39;l&#39;, &#39;o&lt;/w&gt;&#39;)}</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">pairs</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">word_tokens</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
<span class="n">pairs</span><span class="o">.</span><span class="n">add</span><span class="p">((</span><span class="n">word_tokens</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">word_tokens</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]))</span>
<span class="k">return</span> <span class="n">pairs</span>
<span class="k">def</span><span class="w"> </span><span class="nf">train</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">corpus</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">vocab_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Train BPE on corpus to learn merge rules.</span>
<span class="sd"> Args:</span>
<span class="sd"> corpus: List of text strings (typically words or sentences).</span>
<span class="sd"> vocab_size: Override target vocabulary size if provided.</span>
<span class="sd"> Training Process:</span>
<span class="sd"> 1. Count word frequencies in corpus</span>
<span class="sd"> 2. Initialize with character-level tokens (all unique characters)</span>
<span class="sd"> 3. Iteratively:</span>
<span class="sd"> a. Count all adjacent pair frequencies across all words</span>
<span class="sd"> b. Merge most frequent pair into new token</span>
<span class="sd"> c. Update word representations with merged token</span>
<span class="sd"> d. Add merged token to vocabulary</span>
<span class="sd"> 4. Stop when vocabulary reaches target size</span>
<span class="sd"> 5. Build final token↔ID mappings</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">vocab_size</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span> <span class="o">=</span> <span class="n">vocab_size</span>
<span class="c1"># Count word frequencies (training on token statistics, not raw text)</span>
<span class="n">word_freq</span> <span class="o">=</span> <span class="n">Counter</span><span class="p">(</span><span class="n">corpus</span><span class="p">)</span>
<span class="c1"># Initialize vocabulary and word token representations</span>
<span class="n">vocab</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="n">word_tokens</span> <span class="o">=</span> <span class="p">{}</span>
<span class="k">for</span> <span class="n">word</span> <span class="ow">in</span> <span class="n">word_freq</span><span class="p">:</span>
<span class="n">tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_word_tokens</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
<span class="n">word_tokens</span><span class="p">[</span><span class="n">word</span><span class="p">]</span> <span class="o">=</span> <span class="n">tokens</span>
<span class="n">vocab</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span> <span class="c1"># Collect all unique character tokens</span>
<span class="c1"># Convert to sorted list for reproducibility</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">vocab</span><span class="p">))</span>
<span class="c1"># Add special unknown token</span>
<span class="k">if</span> <span class="s1">&#39;&lt;UNK&gt;&#39;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;&lt;UNK&gt;&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">vocab</span>
<span class="c1"># Learn merge rules iteratively</span>
<span class="bp">self</span><span class="o">.</span><span class="n">merges</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">while</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span><span class="p">:</span>
<span class="c1"># Count all adjacent pairs across all words (weighted by frequency)</span>
<span class="n">pair_counts</span> <span class="o">=</span> <span class="n">Counter</span><span class="p">()</span>
<span class="k">for</span> <span class="n">word</span><span class="p">,</span> <span class="n">freq</span> <span class="ow">in</span> <span class="n">word_freq</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">tokens</span> <span class="o">=</span> <span class="n">word_tokens</span><span class="p">[</span><span class="n">word</span><span class="p">]</span>
<span class="n">pairs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_pairs</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span>
<span class="k">for</span> <span class="n">pair</span> <span class="ow">in</span> <span class="n">pairs</span><span class="p">:</span>
<span class="n">pair_counts</span><span class="p">[</span><span class="n">pair</span><span class="p">]</span> <span class="o">+=</span> <span class="n">freq</span> <span class="c1"># Weight by word frequency</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">pair_counts</span><span class="p">:</span>
<span class="k">break</span> <span class="c1"># No more pairs to merge</span>
<span class="c1"># Select most frequent pair</span>
<span class="n">best_pair</span> <span class="o">=</span> <span class="n">pair_counts</span><span class="o">.</span><span class="n">most_common</span><span class="p">(</span><span class="mi">1</span><span class="p">)[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="c1"># Apply merge to all word representations</span>
<span class="k">for</span> <span class="n">word</span> <span class="ow">in</span> <span class="n">word_tokens</span><span class="p">:</span>
<span class="n">tokens</span> <span class="o">=</span> <span class="n">word_tokens</span><span class="p">[</span><span class="n">word</span><span class="p">]</span>
<span class="n">new_tokens</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">i</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">while</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">tokens</span><span class="p">):</span>
<span class="c1"># Check if current position matches merge pair</span>
<span class="k">if</span> <span class="p">(</span><span class="n">i</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span> <span class="ow">and</span>
<span class="n">tokens</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">best_pair</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">and</span>
<span class="n">tokens</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span> <span class="o">==</span> <span class="n">best_pair</span><span class="p">[</span><span class="mi">1</span><span class="p">]):</span>
<span class="c1"># Merge pair into single token</span>
<span class="n">new_tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">best_pair</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">best_pair</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="n">i</span> <span class="o">+=</span> <span class="mi">2</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">tokens</span><span class="p">[</span><span class="n">i</span><span class="p">])</span>
<span class="n">i</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="n">word_tokens</span><span class="p">[</span><span class="n">word</span><span class="p">]</span> <span class="o">=</span> <span class="n">new_tokens</span>
<span class="c1"># Add merged token to vocabulary</span>
<span class="n">merged_token</span> <span class="o">=</span> <span class="n">best_pair</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">best_pair</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">merged_token</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">merges</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">best_pair</span><span class="p">)</span>
<span class="c1"># Build final token↔ID mappings for efficient encode/decode</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_build_mappings</span><span class="p">()</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_build_mappings</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Build bidirectional token↔ID mappings from vocabulary.&quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">token_to_id</span> <span class="o">=</span> <span class="p">{</span><span class="n">token</span><span class="p">:</span> <span class="n">idx</span> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">token</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)}</span>
<span class="bp">self</span><span class="o">.</span><span class="n">id_to_token</span> <span class="o">=</span> <span class="p">{</span><span class="n">idx</span><span class="p">:</span> <span class="n">token</span> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">token</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">)}</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_apply_merges</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Apply learned merge rules to token sequence.</span>
<span class="sd"> Args:</span>
<span class="sd"> tokens: List of character-level tokens.</span>
<span class="sd"> Returns:</span>
<span class="sd"> List of tokens after applying all learned merges.</span>
<span class="sd"> Process:</span>
<span class="sd"> Apply each merge rule in the order learned during training.</span>
<span class="sd"> Early merges have priority over later merges.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">merges</span><span class="p">:</span>
<span class="k">return</span> <span class="n">tokens</span>
<span class="c1"># Apply each merge rule sequentially</span>
<span class="k">for</span> <span class="n">merge_pair</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">merges</span><span class="p">:</span>
<span class="n">new_tokens</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">i</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">while</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">tokens</span><span class="p">):</span>
<span class="k">if</span> <span class="p">(</span><span class="n">i</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span> <span class="ow">and</span>
<span class="n">tokens</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">merge_pair</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">and</span>
<span class="n">tokens</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span> <span class="o">==</span> <span class="n">merge_pair</span><span class="p">[</span><span class="mi">1</span><span class="p">]):</span>
<span class="c1"># Apply merge</span>
<span class="n">new_tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">merge_pair</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">merge_pair</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="n">i</span> <span class="o">+=</span> <span class="mi">2</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">tokens</span><span class="p">[</span><span class="n">i</span><span class="p">])</span>
<span class="n">i</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="n">tokens</span> <span class="o">=</span> <span class="n">new_tokens</span>
<span class="k">return</span> <span class="n">tokens</span>
<span class="k">def</span><span class="w"> </span><span class="nf">encode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Encode text using learned BPE merges.</span>
<span class="sd"> Args:</span>
<span class="sd"> text: String to tokenize.</span>
<span class="sd"> Returns:</span>
<span class="sd"> List of integer token IDs after applying BPE merges.</span>
<span class="sd"> Process:</span>
<span class="sd"> 1. Split text into words (simple whitespace split)</span>
<span class="sd"> 2. Convert each word to character-level tokens</span>
<span class="sd"> 3. Apply learned BPE merges to create subword units</span>
<span class="sd"> 4. Convert subword tokens to integer IDs</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">vocab</span><span class="p">:</span>
<span class="k">return</span> <span class="p">[]</span>
<span class="c1"># Simple word splitting (production systems use more sophisticated approaches)</span>
<span class="n">words</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">split</span><span class="p">()</span>
<span class="n">all_tokens</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">word</span> <span class="ow">in</span> <span class="n">words</span><span class="p">:</span>
<span class="c1"># Start with character-level tokens</span>
<span class="n">word_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_word_tokens</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
<span class="c1"># Apply BPE merges</span>
<span class="n">merged_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_merges</span><span class="p">(</span><span class="n">word_tokens</span><span class="p">)</span>
<span class="n">all_tokens</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">merged_tokens</span><span class="p">)</span>
<span class="c1"># Convert tokens to IDs (unknown tokens map to ID 0)</span>
<span class="n">token_ids</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">token</span> <span class="ow">in</span> <span class="n">all_tokens</span><span class="p">:</span>
<span class="n">token_ids</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">token_to_id</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">token</span><span class="p">,</span> <span class="mi">0</span><span class="p">))</span>
<span class="k">return</span> <span class="n">token_ids</span>
<span class="k">def</span><span class="w"> </span><span class="nf">decode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Decode token IDs back to text.</span>
<span class="sd"> Args:</span>
<span class="sd"> tokens: List of integer token IDs.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Reconstructed text string.</span>
<span class="sd"> Process:</span>
<span class="sd"> 1. Convert IDs to token strings</span>
<span class="sd"> 2. Join tokens together</span>
<span class="sd"> 3. Remove end-of-word markers and restore spaces</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">id_to_token</span><span class="p">:</span>
<span class="k">return</span> <span class="s2">&quot;&quot;</span>
<span class="c1"># Convert IDs to token strings</span>
<span class="n">token_strings</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">token_id</span> <span class="ow">in</span> <span class="n">tokens</span><span class="p">:</span>
<span class="n">token</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">id_to_token</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">token_id</span><span class="p">,</span> <span class="s1">&#39;&lt;UNK&gt;&#39;</span><span class="p">)</span>
<span class="n">token_strings</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">token</span><span class="p">)</span>
<span class="c1"># Join and clean up</span>
<span class="n">text</span> <span class="o">=</span> <span class="s1">&#39;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">token_strings</span><span class="p">)</span>
<span class="c1"># Replace end-of-word markers with spaces</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">&#39;&lt;/w&gt;&#39;</span><span class="p">,</span> <span class="s1">&#39; &#39;</span><span class="p">)</span>
<span class="c1"># Clean up extra spaces</span>
<span class="n">text</span> <span class="o">=</span> <span class="s1">&#39; &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">text</span><span class="o">.</span><span class="n">split</span><span class="p">())</span>
<span class="k">return</span> <span class="n">text</span>
</pre></div>
</div>
<p><strong>BPE Algorithm Insights:</strong></p>
<ul class="simple">
<li><p><strong>Training Phase</strong>: Learn merge rules from corpus statistics by iteratively merging most frequent adjacent pairs</p></li>
<li><p><strong>Inference Phase</strong>: Apply the learned merges, in the order they were learned, to segment new text into subword units</p></li>
<li><p><strong>Frequency-Based Learning</strong>: Common patterns (“ing”, “ed”, “tion”) become single tokens, reducing sequence length</p></li>
<li><p><strong>Graceful Degradation</strong>: Unseen words decompose into known subwords (e.g., “unhappiness” → [“un”, “happi”, “ness”])</p></li>
<li><p><strong>Word Boundary Awareness</strong>: End-of-word markers (<code class="docutils literal notranslate"><span class="pre">&lt;/w&gt;</span></code>) enable learning of prefix vs suffix patterns</p></li>
</ul>
</section>
<section id="tokenization-utilities">
<h3>Tokenization Utilities<a class="headerlink" href="#tokenization-utilities" title="Permalink to this heading">#</a></h3>
<p>Production-ready utilities for tokenizer creation, dataset processing, and performance analysis.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">create_tokenizer</span><span class="p">(</span><span class="n">strategy</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;char&quot;</span><span class="p">,</span>
<span class="n">vocab_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1000</span><span class="p">,</span>
<span class="n">corpus</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tokenizer</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Factory function to create and train tokenizers.</span>
<span class="sd"> Args:</span>
<span class="sd"> strategy: Tokenization approach (&quot;char&quot; or &quot;bpe&quot;).</span>
<span class="sd"> vocab_size: Target vocabulary size (for BPE).</span>
<span class="sd"> corpus: Training corpus for vocabulary building.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Trained tokenizer instance.</span>
<span class="sd"> Example:</span>
<span class="sd"> &gt;&gt;&gt; corpus = [&quot;hello world&quot;, &quot;machine learning&quot;]</span>
<span class="sd"> &gt;&gt;&gt; tokenizer = create_tokenizer(&quot;bpe&quot;, vocab_size=500, corpus=corpus)</span>
<span class="sd"> &gt;&gt;&gt; tokens = tokenizer.encode(&quot;hello&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">strategy</span> <span class="o">==</span> <span class="s2">&quot;char&quot;</span><span class="p">:</span>
<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">CharTokenizer</span><span class="p">()</span>
<span class="k">if</span> <span class="n">corpus</span><span class="p">:</span>
<span class="n">tokenizer</span><span class="o">.</span><span class="n">build_vocab</span><span class="p">(</span><span class="n">corpus</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">strategy</span> <span class="o">==</span> <span class="s2">&quot;bpe&quot;</span><span class="p">:</span>
<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">BPETokenizer</span><span class="p">(</span><span class="n">vocab_size</span><span class="o">=</span><span class="n">vocab_size</span><span class="p">)</span>
<span class="k">if</span> <span class="n">corpus</span><span class="p">:</span>
<span class="n">tokenizer</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">corpus</span><span class="p">,</span> <span class="n">vocab_size</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Unknown tokenization strategy: </span><span class="si">{</span><span class="n">strategy</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">tokenizer</span>
<span class="k">def</span><span class="w"> </span><span class="nf">analyze_tokenization</span><span class="p">(</span><span class="n">texts</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span>
<span class="n">tokenizer</span><span class="p">:</span> <span class="n">Tokenizer</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">float</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Analyze tokenization statistics for performance evaluation.</span>
<span class="sd"> Args:</span>
<span class="sd"> texts: List of text strings to analyze.</span>
<span class="sd"> tokenizer: Trained tokenizer instance.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Dictionary containing:</span>
<span class="sd"> - vocab_size: Number of unique tokens in vocabulary</span>
<span class="sd"> - avg_sequence_length: Mean tokens per text</span>
<span class="sd"> - max_sequence_length: Longest tokenized sequence</span>
<span class="sd"> - total_tokens: Total tokens across all texts</span>
<span class="sd"> - compression_ratio: Average characters per token (higher = better)</span>
<span class="sd"> - unique_tokens: Number of distinct tokens used</span>
<span class="sd"> Use Cases:</span>
<span class="sd"> - Compare character vs BPE on sequence length reduction</span>
<span class="sd"> - Measure compression efficiency (chars/token ratio)</span>
<span class="sd"> - Identify vocabulary utilization (unique_tokens / vocab_size)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">all_tokens</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">total_chars</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">text</span> <span class="ow">in</span> <span class="n">texts</span><span class="p">:</span>
<span class="n">tokens</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="n">all_tokens</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span>
<span class="n">total_chars</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="n">tokenized_lengths</span> <span class="o">=</span> <span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">text</span><span class="p">))</span> <span class="k">for</span> <span class="n">text</span> <span class="ow">in</span> <span class="n">texts</span><span class="p">]</span>
<span class="n">stats</span> <span class="o">=</span> <span class="p">{</span>
<span class="s1">&#39;vocab_size&#39;</span><span class="p">:</span> <span class="p">(</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">vocab_size</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">tokenizer</span><span class="p">,</span> <span class="s1">&#39;vocab_size&#39;</span><span class="p">)</span>
<span class="k">else</span> <span class="nb">len</span><span class="p">(</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">vocab</span><span class="p">)),</span>
<span class="s1">&#39;avg_sequence_length&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">tokenized_lengths</span><span class="p">),</span>
<span class="s1">&#39;max_sequence_length&#39;</span><span class="p">:</span> <span class="nb">max</span><span class="p">(</span><span class="n">tokenized_lengths</span><span class="p">)</span> <span class="k">if</span> <span class="n">tokenized_lengths</span> <span class="k">else</span> <span class="mi">0</span><span class="p">,</span>
<span class="s1">&#39;total_tokens&#39;</span><span class="p">:</span> <span class="nb">len</span><span class="p">(</span><span class="n">all_tokens</span><span class="p">),</span>
<span class="s1">&#39;compression_ratio&#39;</span><span class="p">:</span> <span class="n">total_chars</span> <span class="o">/</span> <span class="nb">len</span><span class="p">(</span><span class="n">all_tokens</span><span class="p">)</span> <span class="k">if</span> <span class="n">all_tokens</span> <span class="k">else</span> <span class="mi">0</span><span class="p">,</span>
<span class="s1">&#39;unique_tokens&#39;</span><span class="p">:</span> <span class="nb">len</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">all_tokens</span><span class="p">))</span>
<span class="p">}</span>
<span class="k">return</span> <span class="n">stats</span>
</pre></div>
</div>
<p><strong>Analysis Metrics Explained:</strong></p>
<ul class="simple">
<li><p><strong>Compression Ratio</strong>: Characters per token (higher = more efficient). BPE typically achieves 3-5x vs character-level at 1.0x</p></li>
<li><p><strong>Vocabulary Utilization</strong>: unique_tokens / vocab_size indicates whether vocabulary is appropriately sized</p></li>
<li><p><strong>Sequence Length</strong>: Directly impacts transformer computation (O(n²) attention complexity)</p></li>
</ul>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Permalink to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Permalink to this heading">#</a></h3>
<p>Ensure you understand tensor operations from Module 01:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>bin/activate-tinytorch.sh
<span class="c1"># Verify tensor module</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>--module<span class="w"> </span>tensor
</pre></div>
</div>
<p><strong>Why This Prerequisite Matters:</strong></p>
<ul class="simple">
<li><p>Tokenization produces integer tensors (sequences of token IDs)</p></li>
<li><p>Embedding layers (Module 11) use token IDs to index into weight matrices</p></li>
<li><p>Understanding tensor shapes is critical for batching variable-length sequences</p></li>
</ul>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Permalink to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/10_tokenization/tokenization_dev.ipynb</span></code></p></li>
<li><p><strong>Implement base Tokenizer interface</strong>: Define encode() and decode() methods as abstract interface</p></li>
<li><p><strong>Build CharTokenizer</strong>: Implement vocabulary building, character-to-ID mappings, encode/decode with unknown token handling</p></li>
<li><p><strong>Implement BPE algorithm</strong>:</p>
<ul class="simple">
<li><p>Character pair counting with frequency statistics</p></li>
<li><p>Iterative merge logic (find most frequent pair, merge across corpus)</p></li>
<li><p>Vocabulary construction from learned merges</p></li>
<li><p>Merge application during encoding</p></li>
</ul>
</li>
<li><p><strong>Create utility functions</strong>: Tokenizer factory, dataset processing, performance analysis</p></li>
<li><p><strong>Test on real data</strong>:</p>
<ul class="simple">
<li><p>Compare character vs BPE on sequence length reduction</p></li>
<li><p>Measure compression ratios (characters per token)</p></li>
<li><p>Test unknown word handling via subword decomposition</p></li>
<li><p>Analyze vocabulary utilization</p></li>
</ul>
</li>
<li><p><strong>Optimize for performance</strong>: Measure tokenization throughput (tokens/second), profile merge application, test on large corpora</p></li>
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">10</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">--module</span> <span class="pre">tokenization</span></code></p></li>
</ol>
<p><strong>Development Tips:</strong></p>
<ul class="simple">
<li><p>Start with a small corpus (100 words, vocab_size=200) to debug the BPE algorithm</p></li>
<li><p>Print learned merge rules to understand what patterns BPE discovers</p></li>
<li><p>Visualize sequence length vs vocabulary size trade-off with multiple BPE configurations</p></li>
<li><p>Test on rare/misspelled words to verify subword decomposition works</p></li>
<li><p>Profile with different vocabulary sizes to find optimal performance point</p></li>
</ul>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Permalink to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Permalink to this heading">#</a></h3>
<p>Run the full test suite to verify tokenization functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>--module<span class="w"> </span>tokenization
<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>tokenization<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Permalink to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Base tokenizer interface</strong>: Abstract class enforces encode/decode contract</p></li>
<li><p><strong>Character tokenizer correctness</strong>: Vocabulary building from corpus, encode/decode round-trip accuracy, unknown character handling with <code class="docutils literal notranslate"><span class="pre">&lt;UNK&gt;</span></code> token</p></li>
<li><p><strong>BPE merge learning</strong>: Pair frequency counting, merge application correctness, vocabulary size convergence, merge order preservation</p></li>
<li><p><strong>Vocabulary management</strong>: Token-to-ID mapping consistency, bidirectional lookup correctness, special token ID reservation</p></li>
<li><p><strong>Edge case handling</strong>: Empty strings, single characters, Unicode characters, whitespace-only text, very long sequences</p></li>
<li><p><strong>Round-trip accuracy</strong>: Encode→decode produces original text for all vocabulary characters</p></li>
<li><p><strong>Performance benchmarks</strong>: Tokenization throughput (tokens/second), vocabulary size vs encode time scaling, batch processing efficiency</p></li>
</ul>
</section>
<section id="inline-testing-validation">
<h3>Inline Testing &amp; Validation<a class="headerlink" href="#inline-testing-validation" title="Permalink to this heading">#</a></h3>
<p>The module includes comprehensive inline tests with progress tracking:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Example inline test output
🔬 Unit Test: Base Tokenizer Interface...
✅ encode() raises NotImplementedError correctly
✅ decode() raises NotImplementedError correctly
📈 Progress: Base Tokenizer Interface ✓
🔬 Unit Test: Character Tokenizer...
✅ Vocabulary built with 89 unique characters
✅ Encode/decode round-trip: &quot;hello&quot; → [8,5,12,12,15] → &quot;hello&quot;
✅ Unknown character maps to &lt;UNK&gt; token (ID 0)
✅ Vocabulary building from corpus works correctly
📈 Progress: Character Tokenizer ✓
🔬 Unit Test: BPE Tokenizer...
✅ Character-level initialization successful
✅ Pair extraction: [&#39;h&#39;,&#39;e&#39;,&#39;l&#39;,&#39;l&#39;,&#39;o&lt;/w&gt;&#39;] → {(&#39;h&#39;,&#39;e&#39;), (&#39;l&#39;,&#39;l&#39;), ...}
✅ Training learned 195 merge rules from corpus
✅ Vocabulary size reached target (200 tokens)
✅ Sequence length reduced 3.2x vs character-level
✅ Unknown words decompose into subwords gracefully
📈 Progress: BPE Tokenizer ✓
🔬 Unit Test: Tokenization Utils...
✅ Tokenizer factory creates correct instances
✅ Dataset processing handles variable lengths
✅ Analysis computes compression ratios correctly
📈 Progress: Tokenization Utils ✓
📊 Analyzing Tokenization Strategies...
Strategy Vocab Avg Len Compression Coverage
------------------------------------------------------------
Character 89 43.2 1.00 89
BPE-100 100 28.5 1.52 87
BPE-500 500 13.8 3.14 245
💡 Key Insights:
- Character: Small vocab, long sequences, perfect coverage
- BPE: Larger vocab, shorter sequences, better compression
- Higher compression ratio = more characters per token = efficiency
🎉 ALL TESTS PASSED! Module ready for export.
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Permalink to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">tokenization_dev</span><span class="w"> </span><span class="kn">import</span> <span class="n">CharTokenizer</span><span class="p">,</span> <span class="n">BPETokenizer</span><span class="p">,</span> <span class="n">create_tokenizer</span><span class="p">,</span> <span class="n">analyze_tokenization</span>
<span class="c1"># Test character-level tokenization</span>
<span class="n">char_tokenizer</span> <span class="o">=</span> <span class="n">CharTokenizer</span><span class="p">()</span>
<span class="n">corpus</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;hello world&quot;</span><span class="p">,</span> <span class="s2">&quot;machine learning is awesome&quot;</span><span class="p">]</span>
<span class="n">char_tokenizer</span><span class="o">.</span><span class="n">build_vocab</span><span class="p">(</span><span class="n">corpus</span><span class="p">)</span>
<span class="n">text</span> <span class="o">=</span> <span class="s2">&quot;hello&quot;</span>
<span class="n">char_ids</span> <span class="o">=</span> <span class="n">char_tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="n">char_decoded</span> <span class="o">=</span> <span class="n">char_tokenizer</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="n">char_ids</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Character: &#39;</span><span class="si">{</span><span class="n">text</span><span class="si">}</span><span class="s2">&#39; → </span><span class="si">{</span><span class="n">char_ids</span><span class="si">}</span><span class="s2"> → &#39;</span><span class="si">{</span><span class="n">char_decoded</span><span class="si">}</span><span class="s2">&#39;&quot;</span><span class="p">)</span>
<span class="c1"># Output: Character: &#39;hello&#39; → [8, 5, 12, 12, 15] → &#39;hello&#39;</span>
<span class="c1"># Test BPE tokenization</span>
<span class="n">bpe_tokenizer</span> <span class="o">=</span> <span class="n">BPETokenizer</span><span class="p">(</span><span class="n">vocab_size</span><span class="o">=</span><span class="mi">500</span><span class="p">)</span>
<span class="n">bpe_tokenizer</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">corpus</span><span class="p">)</span>
<span class="n">bpe_ids</span> <span class="o">=</span> <span class="n">bpe_tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="n">bpe_decoded</span> <span class="o">=</span> <span class="n">bpe_tokenizer</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="n">bpe_ids</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;BPE: &#39;</span><span class="si">{</span><span class="n">text</span><span class="si">}</span><span class="s2">&#39; → </span><span class="si">{</span><span class="n">bpe_ids</span><span class="si">}</span><span class="s2"> → &#39;</span><span class="si">{</span><span class="n">bpe_decoded</span><span class="si">}</span><span class="s2">&#39;&quot;</span><span class="p">)</span>
<span class="c1"># Output: BPE: &#39;hello&#39; → [142, 201] → &#39;hello&#39; # Fewer tokens!</span>
<span class="c1"># Compare sequence lengths</span>
<span class="n">long_text</span> <span class="o">=</span> <span class="s2">&quot;The quick brown fox jumps over the lazy dog&quot;</span> <span class="o">*</span> <span class="mi">10</span>
<span class="n">char_len</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">char_tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">long_text</span><span class="p">))</span>
<span class="n">bpe_len</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">bpe_tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">long_text</span><span class="p">))</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Sequence length reduction: </span><span class="si">{</span><span class="n">char_len</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="n">bpe_len</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2">x&quot;</span><span class="p">)</span>
<span class="c1"># Output: Sequence length reduction: 3.2x</span>
<span class="c1"># Analyze tokenization statistics</span>
<span class="n">test_corpus</span> <span class="o">=</span> <span class="p">[</span>
<span class="s2">&quot;Neural networks learn patterns&quot;</span><span class="p">,</span>
<span class="s2">&quot;Transformers use attention mechanisms&quot;</span><span class="p">,</span>
<span class="s2">&quot;Tokenization enables text processing&quot;</span>
<span class="p">]</span>
<span class="n">char_stats</span> <span class="o">=</span> <span class="n">analyze_tokenization</span><span class="p">(</span><span class="n">test_corpus</span><span class="p">,</span> <span class="n">char_tokenizer</span><span class="p">)</span>
<span class="n">bpe_stats</span> <span class="o">=</span> <span class="n">analyze_tokenization</span><span class="p">(</span><span class="n">test_corpus</span><span class="p">,</span> <span class="n">bpe_tokenizer</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Character - Vocab: </span><span class="si">{</span><span class="n">char_stats</span><span class="p">[</span><span class="s1">&#39;vocab_size&#39;</span><span class="p">]</span><span class="si">}</span><span class="s2">, &quot;</span>
<span class="sa">f</span><span class="s2">&quot;Avg Length: </span><span class="si">{</span><span class="n">char_stats</span><span class="p">[</span><span class="s1">&#39;avg_sequence_length&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2">, &quot;</span>
<span class="sa">f</span><span class="s2">&quot;Compression: </span><span class="si">{</span><span class="n">char_stats</span><span class="p">[</span><span class="s1">&#39;compression_ratio&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Output: Character - Vocab: 89, Avg Length: 42.3, Compression: 1.00</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;BPE - Vocab: </span><span class="si">{</span><span class="n">bpe_stats</span><span class="p">[</span><span class="s1">&#39;vocab_size&#39;</span><span class="p">]</span><span class="si">}</span><span class="s2">, &quot;</span>
<span class="sa">f</span><span class="s2">&quot;Avg Length: </span><span class="si">{</span><span class="n">bpe_stats</span><span class="p">[</span><span class="s1">&#39;avg_sequence_length&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2">, &quot;</span>
<span class="sa">f</span><span class="s2">&quot;Compression: </span><span class="si">{</span><span class="n">bpe_stats</span><span class="p">[</span><span class="s1">&#39;compression_ratio&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Output: BPE - Vocab: 500, Avg Length: 13.5, Compression: 3.13</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Permalink to this heading">#</a></h2>
<section id="real-world-applications">
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Permalink to this heading">#</a></h3>
<p><strong>OpenAI GPT Series:</strong></p>
<ul class="simple">
<li><p><strong>GPT-2</strong>: 50,257 BPE tokens trained on 8M web pages (WebText corpus); vocabulary size chosen to balance 38M embedding parameters (50K × 768 dim) with sequence length for 1024-token context</p></li>
<li><p><strong>GPT-3</strong>: Reused GPT-2’s 50,257-token BPE vocabulary; because the hidden size grew to 12,288, the embedding matrix alone is ~0.6B parameters (50K × 12,288 dim)</p></li>
<li><p><strong>GPT-4</strong>: Uses the tiktoken library’s <code class="docutils literal notranslate"><span class="pre">cl100k_base</span></code> encoding with ~100K tokens to handle code (indentation, operators) and reduce sequence lengths for long documents; optimized for tokenization throughput at scale ($700/million tokens means every millisecond counts)</p></li>
<li><p><strong>Question</strong>: Why did OpenAI double vocabulary size from GPT-3→GPT-4? Consider the trade-off: 2x more embedding parameters vs sequence length reduction for code/long documents. What breaks if vocabulary is too small? Too large?</p></li>
</ul>
<p><strong>Google Multilingual Models:</strong></p>
<ul class="simple">
<li><p><strong>SentencePiece</strong>: Used in T5, mT5, PaLM for 100+ languages without language-specific preprocessing (BERT itself uses the related WordPiece algorithm); unified tokenization enables shared vocabulary across languages</p></li>
<li><p><strong>Vocabulary Sharing</strong>: Multilingual models use single vocabulary for all languages (e.g., mT5: 250K SentencePiece tokens cover 101 languages); trade-off between per-language coverage and total vocabulary size</p></li>
<li><p><strong>Production Scaling</strong>: Google Translate processes billions of sentences daily; tokenization throughput and vocabulary lookup latency are critical for serving at scale</p></li>
<li><p><strong>Question</strong>: English needs ~30K tokens for 99% coverage; Chinese ideographic characters need 50K+. Should a multilingual model use one shared vocabulary or separate vocabularies per language? Consider: shared vocabulary enables zero-shot transfer but reduces per-language coverage.</p></li>
</ul>
<p><strong>Code Models (GitHub Copilot, AlphaCode):</strong></p>
<ul class="simple">
<li><p><strong>Specialized Vocabularies</strong>: Code tokenizers handle programming language syntax (indentation, operators, keywords) and natural language (comments, docstrings); balance code-specific tokens vs natural language</p></li>
<li><p><strong>Identifier Handling</strong>: Variable names like <code class="docutils literal notranslate"><span class="pre">getUserProfile</span></code> vs <code class="docutils literal notranslate"><span class="pre">get_user_profile</span></code> require different tokenization strategies (camelCase splitting, underscore boundaries)</p></li>
<li><p><strong>Trade-off</strong>: Larger vocabulary for code-specific tokens reduces sequence length but increases embedding matrix size; rare identifier fragments still need subword decomposition</p></li>
<li><p><strong>Question</strong>: Should a code tokenizer treat <code class="docutils literal notranslate"><span class="pre">getUserProfile</span></code> as 1 token, 3 tokens (<code class="docutils literal notranslate"><span class="pre">get</span></code>, <code class="docutils literal notranslate"><span class="pre">User</span></code>, <code class="docutils literal notranslate"><span class="pre">Profile</span></code>), or 14 character tokens? Consider: single token = short sequence but huge vocabulary; character-level = long sequences but handles any identifier.</p></li>
</ul>
<p><strong>Production NLP Pipelines:</strong></p>
<ul class="simple">
<li><p><strong>Google Translate</strong>: Billions of sentences daily require high-throughput tokenization (character: ~1M tokens/sec, BPE: ~100K tokens/sec); vocabulary size affects both model memory and inference speed</p></li>
<li><p><strong>OpenAI API</strong>: Tokenization cost is significant at $700/million tokens; every optimization (caching, batch processing, vocabulary size tuning) directly impacts economics</p></li>
<li><p><strong>Mobile Deployment</strong>: Edge models (on-device speech recognition, keyboards) use smaller vocabularies (5K-10K) to fit memory constraints, trading sequence length for model size</p></li>
<li><p><strong>Question</strong>: If your tokenizer processes 10K tokens/second but your model serves 100K requests/second (each 50 tokens), how do you scale? Consider: pre-tokenize and cache? Batch aggressively? Optimize vocabulary?</p></li>
</ul>
</section>
<section id="tokenization-foundations">
<h3>Tokenization Foundations<a class="headerlink" href="#tokenization-foundations" title="Permalink to this heading">#</a></h3>
<p><strong>Vocabulary Size vs Model Parameters:</strong></p>
<ul class="simple">
<li><p><strong>Embedding Matrix Scaling</strong>: Embedding parameters = vocab_size × embed_dim</p>
<ul>
<li><p>GPT-2: 50K vocab × 768 dim = 38.4M parameters (just embeddings!)</p></li>
<li><p>GPT-3: 100K vocab × 12,288 dim = 1.23B parameters (just embeddings!)</p></li>
<li><p>BERT-base: 30K vocab × 768 dim = 23M parameters</p></li>
</ul>
</li>
<li><p><strong>Training Impact</strong>: Larger vocabulary means more parameters to train; embedding gradients scale with vocabulary size (affects memory and optimizer state size)</p></li>
<li><p><strong>Deployment Constraints</strong>: Embedding matrix must fit in memory during inference; on-device models use smaller vocabularies (5K-10K) to meet memory budgets</p></li>
<li><p><strong>Question</strong>: If you increase vocabulary from 10K to 100K (10x), how does this affect: (1) Model size? (2) Training memory (gradients + optimizer states)? (3) Inference latency (vocabulary lookup)?</p></li>
</ul>
<p><strong>Sequence Length vs Computation:</strong></p>
<ul class="simple">
<li><p><strong>Transformer Attention Complexity</strong>: O(n²) where n = sequence length; doubling sequence length quadruples attention computation</p></li>
<li><p><strong>BPE Compression</strong>: Reduces “unhappiness” (11 chars) to [“un”, “happi”, “ness”] (3 tokens) → 13.4x less attention computation (11² vs 3²)</p></li>
<li><p><strong>Batch Processing</strong>: Sequences padded to max length in batch; character-level (1000 tokens) requires 11x more computation than BPE-level (300 tokens) even if actual content is shorter</p></li>
<li><p><strong>Memory Scaling</strong>: Attention matrices scale as (batch_size × n²); character-level consumes far more GPU memory than BPE</p></li>
<li><p><strong>Question</strong>: Given text “machine learning” (16 chars), compare computation: (1) Character tokenizer → 16 tokens → 16² = 256 attention ops; (2) BPE → 3 tokens → 3² = 9 attention ops. What’s the computational savings ratio? How does this scale to 1000-token documents?</p></li>
</ul>
<p><strong>Rare Word Handling:</strong></p>
<ul class="simple">
<li><p><strong>Word-Level Failure</strong>: Word tokenizers map unknown words to <code class="docutils literal notranslate"><span class="pre">&lt;UNK&gt;</span></code> token → complete information loss (can’t distinguish “antidisestablishmentarianism” from “supercalifragilisticexpialidocious”)</p></li>
<li><p><strong>BPE Graceful Degradation</strong>: Decomposes unknown words into known subwords: “unhappiness” → [“un”, “happi”, “ness”] preserves semantic information even if full word never seen during training</p></li>
<li><p><strong>Morphological Generalization</strong>: BPE learns prefixes (“un-”, “pre-”, “anti-”) and suffixes (“-ing”, “-ed”, “-ness”) as tokens, enabling compositional understanding</p></li>
<li><p><strong>Question</strong>: How does BPE handle “antidisestablishmentarianism” (28 chars) even if never seen during training? Trace the decomposition: which subwords would be discovered? How does this enable the model to understand the word’s meaning?</p></li>
</ul>
<p><strong>Tokenization as Compression:</strong></p>
<ul class="simple">
<li><p><strong>Frequent Pattern Learning</strong>: BPE learns common patterns and merges them into single tokens: “ing” → 1 token, “ed” → 1 token, “tion” → 1 token (similar to dictionary-based compression like LZW)</p></li>
<li><p><strong>Information Theory Connection</strong>: Optimal encoding assigns short codes to frequent symbols (Huffman coding); BPE is essentially dictionary-based compression optimized for language statistics</p></li>
<li><p><strong>Compression Ratio</strong>: Character-level = 1.0 chars/token (by definition); BPE typically achieves 3-5 chars/token depending on vocabulary size and language</p></li>
<li><p><strong>Question</strong>: BPE and gzip both learn frequent patterns and replace with short codes. What’s the key difference? Hint: BPE operates at subword granularity (preserves linguistic units), gzip operates at byte level (ignores linguistic structure).</p></li>
</ul>
</section>
<section id="performance-characteristics">
<h3>Performance Characteristics<a class="headerlink" href="#performance-characteristics" title="Permalink to this heading">#</a></h3>
<p><strong>Tokenization Throughput:</strong></p>
<ul class="simple">
<li><p><strong>Character-Level Speed</strong>: ~1M tokens/second (simple per-character lookup: char → ID via hash map)</p></li>
<li><p><strong>BPE Speed</strong>: ~100K tokens/second (iterative merge application: must scan for applicable merge rules)</p></li>
<li><p><strong>Production Caching</strong>: Systems cache tokenization results to amortize preprocessing cost (especially for repeated queries or batch processing)</p></li>
<li><p><strong>Bottleneck Analysis</strong>: If tokenization takes 10ms and model inference takes 100ms (single request), tokenization is 9% overhead; but for batch_size=10, tokenization becomes 100ms (10ms × 10 requests, tokenized one after another) while model inference might be 200ms due to batching efficiency → tokenization is now 33% overhead!</p></li>
<li><p><strong>Question</strong>: Your tokenizer processes 10K tokens/sec. Model serves 100K requests/sec, each request has 50 tokens. Total tokenization throughput needed: 5M tokens/sec. What do you do? Consider: (1) Parallelize tokenization across CPUs? (2) Cache frequent queries? (3) Switch to character tokenizer (10x faster)? (4) Optimize BPE implementation?</p></li>
</ul>
<p><strong>Memory vs Compute Trade-offs:</strong></p>
<ul class="simple">
<li><p><strong>Large Vocabulary</strong>: More memory (embedding matrix) but faster tokenization (fewer merge applications) and shorter sequences (less attention computation)</p></li>
<li><p><strong>Small Vocabulary</strong>: Less memory (smaller embedding matrix) but slower tokenization (more merge rules to apply) and longer sequences (more attention computation)</p></li>
<li><p><strong>Optimal Vocabulary Size</strong>: Depends on deployment constraints—edge devices (mobile, IoT) prioritize memory (use smaller vocab, accept longer sequences); cloud serving prioritizes throughput (use larger vocab, reduce sequence length)</p></li>
<li><p><strong>Embedding Matrix Memory</strong>: GPT-3’s 100K vocabulary × 12,288 dim × 2 bytes (fp16) = 2.5GB just for embeddings; quantization to int8 reduces to 1.25GB</p></li>
<li><p><strong>Question</strong>: For edge deployment (mobile device with 2GB RAM budget), should you prioritize: (1) Smaller vocabulary (5K tokens, saves 400MB embedding memory) accepting longer sequences? (2) Larger vocabulary (50K tokens, uses 2GB embeddings) for shorter sequences? Consider: attention computation scales quadratically with sequence length.</p></li>
</ul>
<p><strong>Batching and Padding:</strong></p>
<ul class="simple">
<li><p><strong>Padding Waste</strong>: Variable-length sequences padded to max length in batch; wasted computation on padding tokens (don’t contribute to loss but consume attention operations)</p></li>
<li><p><strong>Character-Level Penalty</strong>: Longer sequences require more padding—if batch contains [10, 50, 500] character-level tokens, all padded to 500 → 490 + 450 + 0 = 940 wasted tokens (63% waste)</p></li>
<li><p><strong>BPE Advantage</strong>: Shorter sequences reduce padding waste—same batch as [3, 15, 150] BPE tokens, padded to 150 → 147 + 135 + 0 = 282 wasted tokens (still 63% waste, but absolute numbers smaller)</p></li>
<li><p><strong>Dynamic Batching</strong>: Group similar-length sequences to reduce padding waste (collate_fn in DataLoader)</p></li>
<li><p><strong>Question</strong>: Batch of sequences with lengths [10, 50, 500] tokens. (1) Character-level: Total computation = 3 × 500² = 750K attention operations. (2) BPE reduces to [3, 15, 150]: Total = 3 × 150² = 67.5K operations (11x reduction). But what if you sort and batch by length: [[10, 50], [500]] → Char: 2×50² + 1×500² = 255K; BPE: 2×15² + 1×150² = 23K. How much does batching strategy matter?</p></li>
</ul>
<p><strong>Multilingual Considerations:</strong></p>
<ul class="simple">
<li><p><strong>Shared Vocabulary</strong>: Enables zero-shot cross-lingual transfer (model trained on English can handle French without fine-tuning) but reduces per-language coverage</p></li>
<li><p><strong>Language-Specific Vocabulary Size</strong>: English: 26 letters → 30K tokens for 99% coverage; Chinese: 50K+ characters → need 60K tokens for equivalent coverage; Arabic: morphologically rich → needs more subword decomposition</p></li>
<li><p><strong>Vocabulary Allocation</strong>: Multilingual model with 100K shared vocabulary must allocate tokens across languages; high-resource languages (English) get better coverage than low-resource languages (Swahili)</p></li>
<li><p><strong>Question</strong>: Should a multilingual model use: (1) One shared vocabulary (100K tokens across all languages, enables transfer but dilutes per-language coverage)? (2) Separate vocabularies per language (30K English + 60K Chinese = 90K total, better coverage but no cross-lingual transfer)? Consider: shared embedding space enables “cat” (English) to align with “chat” (French) via training.</p></li>
</ul>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Permalink to this heading">#</a></h2>
<p>You’re about to implement the tokenization systems that power every modern language model—from GPT-4 processing trillions of tokens to Google Translate serving billions of requests daily. Tokenization is the critical bridge between human language (text) and neural networks (numbers), and the design decisions you make have profound effects on model size, computational cost, and generalization ability.</p>
<p>By building these systems from scratch, you’ll understand the fundamental trade-off shaping modern NLP: <strong>vocabulary size vs sequence length</strong>. Larger vocabularies mean more model parameters (embedding matrix size = vocab_size × embed_dim) but shorter sequences (less computation, especially in transformers with O(n²) attention). Smaller vocabularies mean fewer parameters but longer sequences requiring more computation. You’ll see why BPE emerged as the dominant approach—balancing both extremes optimally through learned subword decomposition—and why every major language model (GPT, BERT, T5, LLaMA) uses some form of subword tokenization.</p>
<p>This module connects directly to Module 11 (Embeddings): your token IDs will index into embedding matrices, converting discrete tokens into continuous vectors. Understanding tokenization deeply—not just as a black-box API but as a system with measurable performance characteristics and design trade-offs—will make you a better ML systems engineer. You’ll appreciate why GPT-3 doubled vocabulary size from GPT-2 (50K→100K to handle code and long documents), why mobile models use tiny 5K vocabularies (memory constraints), and why production systems aggressively cache tokenization results (throughput optimization).</p>
<p>Take your time, experiment with different vocabulary sizes (100, 1000, 10000), and measure everything: sequence length reduction, compression ratios, tokenization throughput. This is where text becomes numbers, where linguistics meets systems engineering, and where you’ll develop the intuition needed to make smart trade-offs in production NLP systems.</p>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
🚀 Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
</div>
<a class="sd-stretched-link reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/10_tokenization/tokenization_dev.ipynb"></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
⚡ Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/10_tokenization/tokenization_dev.ipynb"></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
📖 View Source</div>
<p class="sd-card-text">Browse the Jupyter notebook and understand the implementation.</p>
</div>
<a class="sd-stretched-link reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/10_tokenization/tokenization_dev.ipynb"></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">💾 Save Your Progress</p>
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="09_spatial_ABOUT.html" title="previous page">← Previous Module</a>
<a class="right-next" href="11_embeddings_ABOUT.html" title="next page">Next Module →</a>
</div>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./modules"
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="09_spatial_ABOUT.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">09. Spatial Operations</p>
</div>
</a>
<a class="right-next"
href="11_embeddings_ABOUT.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">11. Embeddings - Token to Vector Representations</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-reflect">Build → Use → Reflect</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#base-tokenizer-interface">Base Tokenizer Interface</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#character-level-tokenizer">Character-Level Tokenizer</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#bpe-byte-pair-encoding-tokenizer">BPE (Byte Pair Encoding) Tokenizer</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tokenization-utilities">Tokenization Utilities</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tokenization-foundations">Tokenization Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2025.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>