mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-28 15:13:23 -05:00
fix: Add missing typing imports to Module 10 tokenization
Issue: CharTokenizer was failing with NameError: name 'List' is not defined

Root cause: typing imports were not marked with #| export

Fix:
✅ Added #| export directive to import block in tokenization_dev.py
✅ Re-exported module using 'tito export 10_tokenization'
✅ typing.List, Dict, Tuple, Optional, Set now properly exported

Verification:
- CharTokenizer.build_vocab() works ✅
- encode() and decode() work ✅
- Tested on Shakespeare sample text ✅

This fixes the integration with vaswani_shakespeare.py, which now properly uses CharTokenizer from Module 10 instead of manual tokenization.
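For context, the verification above corresponds roughly to the following round trip. This is a sketch only: the method names come from the commit message, but the constructor and build_vocab signature are assumed rather than copied from Module 10.

```python
# Hypothetical usage sketch - build_vocab/encode/decode are named in the
# commit message; argument and return types are assumed.
from tinytorch.text.tokenization import CharTokenizer

sample = "To be, or not to be: that is the question."  # Shakespeare-style sample text

tok = CharTokenizer()
tok.build_vocab(sample)          # assumed to build the char <-> id mappings

ids = tok.encode("to be")        # list of integer token ids
text = tok.decode(ids)           # should round-trip back to "to be"
print(ids, text)
```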
modules/source/10_tokenization/tokenization_dev.ipynb
@@ -3,7 +3,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b7c61b46",
"id": "bbeed6a9",
"metadata": {},
"outputs": [],
"source": [
@@ -13,7 +13,7 @@
},
{
"cell_type": "markdown",
"id": "8addd72f",
"id": "ab628a0c",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -45,7 +45,7 @@
},
{
"cell_type": "markdown",
"id": "7651c93b",
"id": "542171ad",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -70,10 +70,11 @@
{
"cell_type": "code",
"execution_count": null,
"id": "40820d50",
"id": "6fe4fe02",
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"import numpy as np\n",
"from typing import List, Dict, Tuple, Optional, Set\n",
"import json\n",
@@ -81,15 +82,12 @@
"from collections import defaultdict, Counter\n",
"\n",
"# Import only Module 01 (Tensor) - this module has minimal dependencies\n",
"import sys\n",
"import os\n",
"sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))\n",
"from tensor_dev import Tensor"
"from tinytorch.core.tensor import Tensor"
]
},
{
"cell_type": "markdown",
"id": "443dd927",
"id": "ba7349a9",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -100,23 +98,40 @@
"\n",
"### The Text-to-Numbers Challenge\n",
"\n",
"Consider the sentence: \"Hello, world!\"\n",
"Consider the sentence: \"Hello, world!\" - how do we turn this into numbers a neural network can process?\n",
"\n",
"```\n",
"Human Text: \"Hello, world!\"\n",
" ↓\n",
" [Tokenization]\n",
" ↓\n",
"Numerical IDs: [72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33]\n",
"┌─────────────────────────────────────────────────────────────────┐\n",
"│ TOKENIZATION PIPELINE: Text → Numbers │\n",
"├─────────────────────────────────────────────────────────────────┤\n",
"│ │\n",
"│ Input (Human Text): \"Hello, world!\" │\n",
"│ │ │\n",
"│ ├─ Step 1: Split into tokens │\n",
"│ │ ['H','e','l','l','o',',', ...'] │\n",
"│ │ │\n",
"│ ├─ Step 2: Map to vocabulary IDs │\n",
"│ │ [72, 101, 108, 108, 111, ...] │\n",
"│ │ │\n",
"│ ├─ Step 3: Handle unknowns │\n",
"│ │ Unknown chars → special <UNK> token │\n",
"│ │ │\n",
"│ └─ Step 4: Enable decoding │\n",
"│ IDs → original text │\n",
"│ │\n",
"│ Output (Token IDs): [72, 101, 108, 108, 111, 44, 32, ...] │\n",
"│ │\n",
"└─────────────────────────────────────────────────────────────────┘\n",
"```\n",
"\n",
"### The Four-Step Process\n",
"\n",
"How do we represent this for a neural network? We need to:\n",
"1. **Split text into tokens** - meaningful units like words, subwords, or characters\n",
"2. **Map tokens to integers** - create a vocabulary that assigns unique IDs\n",
"3. **Handle unknown text** - deal with words not seen during training\n",
"4. **Enable reconstruction** - convert numbers back to readable text\n",
"How do we represent text for a neural network? We need a systematic pipeline:\n",
"\n",
"**1. Split text into tokens** - Break text into meaningful units (words, subwords, or characters)\n",
"**2. Map tokens to integers** - Create a vocabulary that assigns each token a unique ID\n",
"**3. Handle unknown text** - Deal gracefully with tokens not seen during training\n",
"**4. Enable reconstruction** - Convert numbers back to readable text for interpretation\n",
"\n",
"### Why This Matters\n",
"\n",
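An aside on the example above: the IDs shown in that cell are simply the characters' Unicode code points, so they can be reproduced directly. A real tokenizer maps tokens through a learned vocabulary rather than raw code points, but the round trip is the same idea:

```python
# Reproduce the example IDs from the pipeline diagram using code points.
text = "Hello, world!"
ids = [ord(c) for c in text]
print(ids)                           # [72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33]
print("".join(chr(i) for i in ids))  # "Hello, world!" - lossless round trip
```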
@@ -129,7 +144,7 @@
},
{
"cell_type": "markdown",
"id": "7e997606",
"id": "c39ef970",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -142,15 +157,59 @@
"**Approach**: Each character gets its own token\n",
"\n",
"```\n",
"Text: \"Hello world\"\n",
" ↓\n",
"Tokens: ['H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd']\n",
" ↓\n",
"IDs: [8, 5, 12, 12, 15, 0, 23, 15, 18, 12, 4]\n",
"┌──────────────────────────────────────────────────────────────┐\n",
"│ CHARACTER TOKENIZATION PROCESS │\n",
"├──────────────────────────────────────────────────────────────┤\n",
"│ │\n",
"│ Step 1: Build Vocabulary from Unique Characters │\n",
"│ ┌────────────────────────────────────────────────────────┐ │\n",
"│ │ Corpus: [\"hello\", \"world\"] │ │\n",
"│ │ ↓ │ │\n",
"│ │ Unique chars: ['h', 'e', 'l', 'o', 'w', 'r', 'd'] │ │\n",
"│ │ ↓ │ │\n",
"│ │ Vocabulary: ['<UNK>','h','e','l','o','w','r','d'] │ │\n",
"│ │ IDs: 0 1 2 3 4 5 6 7 │ │\n",
"│ └────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"│ Step 2: Encode Text Character by Character │\n",
"│ ┌────────────────────────────────────────────────────────┐ │\n",
"│ │ Text: \"hello\" │ │\n",
"│ │ │ │\n",
"│ │ 'h' → 1 (lookup in vocabulary) │ │\n",
"│ │ 'e' → 2 │ │\n",
"│ │ 'l' → 3 │ │\n",
"│ │ 'l' → 3 │ │\n",
"│ │ 'o' → 4 │ │\n",
"│ │ │ │\n",
"│ │ Result: [1, 2, 3, 3, 4] │ │\n",
"│ └────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"│ Step 3: Decode by Reversing ID Lookup │\n",
"│ ┌────────────────────────────────────────────────────────┐ │\n",
"│ │ IDs: [1, 2, 3, 3, 4] │ │\n",
"│ │ │ │\n",
"│ │ 1 → 'h' (reverse lookup) │ │\n",
"│ │ 2 → 'e' │ │\n",
"│ │ 3 → 'l' │ │\n",
"│ │ 3 → 'l' │ │\n",
"│ │ 4 → 'o' │ |\n",
"│ │ │ │\n",
"│ │ Result: \"hello\" │ │\n",
"│ └────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"└──────────────────────────────────────────────────────────────┘\n",
"```\n",
"\n",
"**Pros**: Small vocabulary (~100), handles any text, no unknown tokens\n",
"**Cons**: Long sequences (1 char = 1 token), limited semantic understanding\n",
"**Pros**: \n",
"- Small vocabulary (~100 chars)\n",
"- Handles any text perfectly\n",
"- No unknown tokens (every character can be mapped)\n",
"- Simple implementation\n",
"\n",
"**Cons**: \n",
"- Long sequences (1 character = 1 token)\n",
"- Limited semantic understanding (no word boundaries)\n",
"- More compute (longer sequences to process)\n",
"\n",
"### Word-Level Tokenization\n",
"**Approach**: Each word gets its own token\n",
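The character-level walkthrough in the cell above condenses into a few lines of plain Python. This is an illustrative sketch, not the module's actual CharTokenizer implementation:

```python
# Minimal character-level tokenizer sketch mirroring the diagram above.
corpus = ["hello", "world"]
chars = "".join(corpus)

# Step 1: build the vocabulary from unique characters, reserving id 0 for <UNK>
vocab = ["<UNK>"] + sorted(set(chars), key=chars.index)   # first-occurrence order
char_to_id = {ch: i for i, ch in enumerate(vocab)}
id_to_char = {i: ch for ch, i in char_to_id.items()}

# Step 2: encode by per-character lookup (unseen characters map to <UNK>)
def encode(text):
    return [char_to_id.get(ch, 0) for ch in text]

# Step 3: decode by reversing the lookup
def decode(ids):
    return "".join(id_to_char.get(i, "<UNK>") for i in ids)

print(encode("hello"))           # [1, 2, 3, 3, 4]
print(decode(encode("hello")))   # "hello"
```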
@@ -197,7 +256,7 @@
},
{
"cell_type": "markdown",
"id": "fc75101c",
"id": "13b74a9d",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -209,7 +268,7 @@
},
{
"cell_type": "markdown",
"id": "d1057ce5",
"id": "e8613976",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -231,7 +290,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "fa4a37fa",
"id": "bb58a938",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -294,7 +353,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8b107a19",
"id": "ddded2c2",
"metadata": {
"nbgrader": {
"grade": true,
@@ -332,7 +391,7 @@
},
{
"cell_type": "markdown",
"id": "0207d72c",
"id": "5f2f6599",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -374,7 +433,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "c9b4e0b3",
"id": "bdba5211",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -512,7 +571,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "6fd3a515",
"id": "037f2a1b",
"metadata": {
"nbgrader": {
"grade": true,
@@ -563,7 +622,7 @@
},
{
"cell_type": "markdown",
"id": "addbc685",
"id": "6ba4ae7f",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -579,7 +638,7 @@
},
{
"cell_type": "markdown",
"id": "eb9653c3",
"id": "1e93979f",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -587,44 +646,90 @@
"source": [
"### Byte Pair Encoding (BPE) Tokenizer\n",
"\n",
"BPE is the secret sauce behind modern language models. It learns to merge frequent character pairs, creating subword units that balance vocabulary size with sequence length.\n",
"BPE is the secret sauce behind modern language models (GPT, BERT, etc.). It learns to merge frequent character pairs, creating subword units that balance vocabulary size with sequence length.\n",
"\n",
"```\n",
"BPE Training Process:\n",
"\n",
"Step 1: Start with character vocabulary\n",
"Text: [\"hello\", \"hello\", \"help\"]\n",
"Initial tokens: [['h','e','l','l','o</w>'], ['h','e','l','l','o</w>'], ['h','e','l','p</w>']]\n",
"\n",
"Step 2: Count character pairs\n",
"('h','e'): 3 times ← Most frequent!\n",
"('e','l'): 3 times\n",
"('l','l'): 2 times\n",
"('l','o'): 2 times\n",
"('l','p'): 1 time\n",
"\n",
"Step 3: Merge most frequent pair\n",
"Merge ('h','e') → 'he'\n",
"Tokens: [['he','l','l','o</w>'], ['he','l','l','o</w>'], ['he','l','p</w>']]\n",
"Vocab: ['h','e','l','o','p','</w>','he'] ← New token added\n",
"\n",
"Step 4: Repeat until target vocabulary size\n",
"Next merge: ('l','l') → 'll'\n",
"Tokens: [['he','ll','o</w>'], ['he','ll','o</w>'], ['he','l','p</w>']]\n",
"Vocab: ['h','e','l','o','p','</w>','he','ll'] ← Growing vocabulary\n",
"\n",
"Final result:\n",
"Text \"hello\" → ['he', 'll', 'o</w>'] → 3 tokens (vs 5 characters)\n",
"Text \"help\" → ['he', 'l', 'p</w>'] → 3 tokens (vs 4 characters)\n",
"┌───────────────────────────────────────────────────────────────────────────┐\n",
"│ BPE TRAINING ALGORITHM: Learning Subword Units │\n",
"├───────────────────────────────────────────────────────────────────────────┤\n",
"│ │\n",
"│ STEP 1: Initialize with Character Vocabulary │\n",
"│ ┌──────────────────────────────────────────────────────────────┐ │\n",
"│ │ Training Data: [\"hello\", \"hello\", \"help\"] │ │\n",
"│ │ │ │\n",
"│ │ Initial Tokens (with end-of-word markers): │ │\n",
"│ │ ['h','e','l','l','o</w>'] (hello) │ │\n",
"│ │ ['h','e','l','l','o</w>'] (hello) │ │\n",
"│ │ ['h','e','l','p</w>'] (help) │ │\n",
"│ │ │ │\n",
"│ │ Starting Vocab: ['h', 'e', 'l', 'o', 'p', '</w>'] │ │\n",
"│ │ ↑ All unique characters │ │\n",
"│ └──────────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"│ STEP 2: Count All Adjacent Pairs │\n",
"│ ┌──────────────────────────────────────────────────────────────┐ │\n",
"│ │ Pair Frequency Analysis: │ │\n",
"│ │ │ │\n",
"│ │ ('h', 'e'): ██████ 3 occurrences ← MOST FREQUENT! │ │\n",
"│ │ ('e', 'l'): ██████ 3 occurrences │ │\n",
"│ │ ('l', 'l'): ████ 2 occurrences │ │\n",
"│ │ ('l', 'o'): ████ 2 occurrences │ │\n",
"│ │ ('o', '<'): ████ 2 occurrences │ │\n",
"│ │ ('l', 'p'): ██ 1 occurrence │ │\n",
"│ │ ('p', '<'): ██ 1 occurrence │ │\n",
"│ └──────────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"│ STEP 3: Merge Most Frequent Pair │\n",
"│ ┌──────────────────────────────────────────────────────────────┐ │\n",
"│ │ Merge Operation: ('h', 'e') → 'he' │ │\n",
"│ │ │ │\n",
"│ │ BEFORE: AFTER: │ │\n",
"│ │ ['h','e','l','l','o</w>'] → ['he','l','l','o</w>'] │ │\n",
"│ │ ['h','e','l','l','o</w>'] → ['he','l','l','o</w>'] │ │\n",
"│ │ ['h','e','l','p</w>'] → ['he','l','p</w>'] │ │\n",
"│ │ │ │\n",
"│ │ Updated Vocab: ['h','e','l','o','p','</w>', 'he'] │ │\n",
"│ │ ↑ NEW TOKEN! │ │\n",
"│ └──────────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"│ STEP 4: Repeat Until Target Vocab Size Reached │\n",
"│ ┌──────────────────────────────────────────────────────────────┐ │\n",
"│ │ Iteration 2: Next most frequent is ('l', 'l') │ │\n",
"│ │ Merge ('l','l') → 'll' │ │\n",
"│ │ │ │\n",
"│ │ ['he','l','l','o</w>'] → ['he','ll','o</w>'] │ │\n",
"│ │ ['he','l','l','o</w>'] → ['he','ll','o</w>'] │ │\n",
"│ │ ['he','l','p</w>'] → ['he','l','p</w>'] │ │\n",
"│ │ │ │\n",
"│ │ Updated Vocab: ['h','e','l','o','p','</w>','he','ll'] │ │\n",
"│ │ ↑ NEW! │ │\n",
"│ │ │ │\n",
"│ │ Continue merging until vocab_size target... │ │\n",
"│ └──────────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"│ FINAL RESULTS: │\n",
"│ ┌──────────────────────────────────────────────────────────────┐ │\n",
"│ │ Trained BPE can now encode efficiently: │ │\n",
"│ │ │ │\n",
"│ │ \"hello\" → ['he', 'll', 'o</w>'] = 3 tokens (vs 5 chars) │ │\n",
"│ │ \"help\" → ['he', 'l', 'p</w>'] = 3 tokens (vs 4 chars) │ │\n",
"│ │ │ │\n",
"│ │ Key Insights: BPE automatically discovers: │ │\n",
"│ │ - Common prefixes ('he') │ │\n",
"│ │ - Morphological patterns ('ll') │ │\n",
"│ │ - Natural word boundaries (</w>) │ │\n",
"│ └──────────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"└───────────────────────────────────────────────────────────────────────────┘\n",
"```\n",
"\n",
"BPE discovers natural word boundaries and common patterns automatically!"
"**Why BPE Works**: By starting with characters and iteratively merging frequent pairs, BPE discovers the natural statistical patterns in language. Common words become single tokens, rare words split into recognizable subword pieces!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95105bc9",
"id": "89452d55",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
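For readers who prefer code to diagrams, the first merge step described in the BPE cell above can be sketched as follows. This is illustrative only and far simpler than the module's BPETokenizer:

```python
from collections import Counter

# Toy version of the BPE training walkthrough above.
words = ["hello", "hello", "help"]
# Represent each word as symbols, with </w> attached to the final character.
tokens = [tuple(w[:-1]) + (w[-1] + "</w>",) for w in words]

def count_pairs(tokens):
    """Count adjacent symbol pairs across all words."""
    pairs = Counter()
    for word in tokens:
        for a, b in zip(word, word[1:]):
            pairs[(a, b)] += 1
    return pairs

def merge_pair(tokens, pair):
    """Replace every occurrence of `pair` with a single merged symbol."""
    merged = []
    for word in tokens:
        out, i = [], 0
        while i < len(word):
            if i + 1 < len(word) and (word[i], word[i + 1]) == pair:
                out.append(word[i] + word[i + 1])
                i += 2
            else:
                out.append(word[i])
                i += 1
        merged.append(tuple(out))
    return merged

pairs = count_pairs(tokens)
best = max(pairs, key=pairs.get)   # ('h', 'e'), which occurs 3 times
tokens = merge_pair(tokens, best)
print(best, tokens)
# ('h', 'e') [('he', 'l', 'l', 'o</w>'), ('he', 'l', 'l', 'o</w>'), ('he', 'l', 'p</w>')]
# Repeating this loop until a target vocabulary size is reached yields the subword vocab.
```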
@@ -911,7 +1016,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "49023f77",
"id": "2ceb9e28",
"metadata": {
"nbgrader": {
"grade": true,
@@ -966,7 +1071,7 @@
},
{
"cell_type": "markdown",
"id": "be8ef10a",
"id": "8e51f1a4",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -997,7 +1102,7 @@
},
{
"cell_type": "markdown",
"id": "12b3d35d",
"id": "6d384f02",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1019,7 +1124,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "3dd1e90f",
"id": "20ebcfe2",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1131,7 +1236,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "7f316410",
"id": "3abc8dcd",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1176,7 +1281,7 @@
},
{
"cell_type": "markdown",
"id": "a172584f",
"id": "f8b901eb",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1190,7 +1295,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "bc583368",
"id": "df2ae12e",
"metadata": {
"nbgrader": {
"grade": false,
@@ -1241,7 +1346,7 @@
},
{
"cell_type": "markdown",
"id": "dfcdeeb7",
"id": "f23d4b98",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -1281,17 +1386,63 @@
"\n",
"**Memory implications for embedding tables**:\n",
"```\n",
"Tokenizer Vocab Size Embed Dim Parameters Memory (fp32)\n",
"Character 100 512 51K 204 KB\n",
"BPE-1K 1,000 512 512K 2.0 MB\n",
"BPE-50K 50,000 512 25.6M 102.4 MB\n",
"Word-100K 100,000 512 51.2M 204.8 MB\n",
"┌─────────────────────────────────────────────────────────────────────┐\n",
"│ EMBEDDING TABLE MEMORY: Vocabulary Size × Embedding Dimension │\n",
"├─────────────────────────────────────────────────────────────────────┤\n",
"│ │\n",
"│ CHARACTER TOKENIZER (Vocab: 100) │\n",
"│ ┌────────────────────────────┐ │\n",
"│ │ 100 × 512 = 51,200 params │ Memory: 204 KB │\n",
"│ │ ████ │ ↑ Tiny embedding table! │\n",
"│ └────────────────────────────┘ │\n",
"│ │\n",
"│ BPE-SMALL (Vocab: 1,000) │\n",
"│ ┌────────────────────────────┐ │\n",
"│ │ 1K × 512 = 512K params │ Memory: 2.0 MB │\n",
"│ │ ██████████ │ ↑ Still manageable │\n",
"│ └────────────────────────────┘ │\n",
"│ │\n",
"│ BPE-LARGE (Vocab: 50,000) ← MOST PRODUCTION MODELS │\n",
"│ ┌────────────────────────────────────────────────────────┐ │\n",
"│ │ 50K × 512 = 25.6M params │ │\n",
"│ │ ████████████████████████████████████████████████ │ │\n",
"│ │ │ │\n",
"│ │ Memory: 102.4 MB (fp32) │ │\n",
"│ │ 51.2 MB (fp16) ← Half precision saves 50% │ │\n",
"│ │ 25.6 MB (int8) ← Quantization saves 75% │ │\n",
"│ └────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"│ WORD-LEVEL (Vocab: 100,000) │\n",
"│ ┌────────────────────────────────────────────────────────┐ │\n",
"│ │ 100K × 512 = 51.2M params │ │\n",
"│ │ ████████████████████████████████████████████████████ │ │\n",
"│ │ │ │\n",
"│ │ Memory: 204.8 MB (fp32) ← Often too large! │ │\n",
"│ │ 102.4 MB (fp16) │ │\n",
"│ └────────────────────────────────────────────────────────┘ │\n",
"│ │\n",
"│ Key Trade-off: │\n",
"│ Larger vocab → Shorter sequences → Less compute │\n",
"│ BUT larger vocab → More embedding memory → Harder to train │\n",
"│ │\n",
"└─────────────────────────────────────────────────────────────────────┘\n",
"\n",
"Real-World Production Examples:\n",
"┌─────────────┬──────────────┬───────────────┬──────────────────┐\n",
"│ Model │ Vocab Size │ Embed Dim │ Embed Memory │\n",
"├─────────────┼──────────────┼───────────────┼──────────────────┤\n",
"│ GPT-2 │ 50,257 │ 1,600 │ 321 MB │\n",
"│ GPT-3 │ 50,257 │ 12,288 │ 2.4 GB │\n",
"│ BERT │ 30,522 │ 768 │ 94 MB │\n",
"│ T5 │ 32,128 │ 512 │ 66 MB │\n",
"│ LLaMA-7B │ 32,000 │ 4,096 │ 524 MB │\n",
"└─────────────┴──────────────┴───────────────┴──────────────────┘\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "423df187",
"id": "a7c5816a",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
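The figures in the memory tables above follow directly from vocab_size × embed_dim × bytes per parameter. A quick sanity check, using decimal megabytes as the tables do:

```python
# Sanity-check the embedding-table sizes quoted above (fp32 = 4 bytes per parameter).
def embedding_memory_mb(vocab_size, embed_dim, bytes_per_param=4):
    return vocab_size * embed_dim * bytes_per_param / 1e6

print(embedding_memory_mb(50_000, 512))     # 102.4  -> BPE-50K row
print(embedding_memory_mb(100_000, 512))    # 204.8  -> Word-100K row
print(embedding_memory_mb(50_257, 1_600))   # ~321.6 -> GPT-2 row
print(embedding_memory_mb(50_000, 512, 2))  # 51.2   -> same table at fp16
```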
@@ -1305,7 +1456,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "6dceaa48",
"id": "2f3cfd32",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1397,7 +1548,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8bb055b5",
"id": "9d68a974",
"metadata": {},
"outputs": [],
"source": [
@@ -1409,7 +1560,7 @@
},
{
"cell_type": "markdown",
"id": "824eab53",
"id": "b7885211",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -1441,7 +1592,7 @@
},
{
"cell_type": "markdown",
"id": "3eab9125",
"id": "1c62fd5c",
"metadata": {
"cell_marker": "\"\"\""
},
modules/source/10_tokenization/tokenization_dev.py
@@ -62,6 +62,7 @@ from tinytorch.text.tokenization import Tokenizer, CharTokenizer, BPETokenizer
"""

# %%
#| export
import numpy as np
from typing import List, Dict, Tuple, Optional, Set
import json
tinytorch/text/tokenization.py (generated)
@@ -21,6 +21,16 @@ __all__ = ['Tokenizer', 'CharTokenizer', 'BPETokenizer']
#| default_exp text.tokenization
#| export

# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 3
import numpy as np
from typing import List, Dict, Tuple, Optional, Set
import json
import re
from collections import defaultdict, Counter

# Import only Module 01 (Tensor) - this module has minimal dependencies
from ..core.tensor import Tensor

# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 8
class Tokenizer:
    """
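Why the missing export mattered: when the typing import is absent from the generated module, any annotated signature that references List fails as soon as its def statement executes. A minimal illustration, using a hypothetical standalone function rather than the real class body:

```python
from typing import List  # the import that '#| export' now carries into tokenization.py

# With the import present, an annotated signature like this evaluates cleanly.
# If the import were missing, Python would raise
# "NameError: name 'List' is not defined" the moment the def statement runs
# (under default annotation semantics).
def encode(text: str) -> List[int]:
    return [ord(c) for c in text]

print(encode("ab"))  # [97, 98]
```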