diff --git a/modules/source/10_tokenization/tokenization_dev.ipynb b/modules/source/10_tokenization/tokenization_dev.ipynb index 6c4d64a2..2dde6104 100644 --- a/modules/source/10_tokenization/tokenization_dev.ipynb +++ b/modules/source/10_tokenization/tokenization_dev.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7c61b46", + "id": "bbeed6a9", "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "markdown", - "id": "8addd72f", + "id": "ab628a0c", "metadata": { "cell_marker": "\"\"\"" }, @@ -45,7 +45,7 @@ }, { "cell_type": "markdown", - "id": "7651c93b", + "id": "542171ad", "metadata": { "cell_marker": "\"\"\"" }, @@ -70,10 +70,11 @@ { "cell_type": "code", "execution_count": null, - "id": "40820d50", + "id": "6fe4fe02", "metadata": {}, "outputs": [], "source": [ + "#| export\n", "import numpy as np\n", "from typing import List, Dict, Tuple, Optional, Set\n", "import json\n", @@ -81,15 +82,12 @@ "from collections import defaultdict, Counter\n", "\n", "# Import only Module 01 (Tensor) - this module has minimal dependencies\n", - "import sys\n", - "import os\n", - "sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))\n", - "from tensor_dev import Tensor" + "from tinytorch.core.tensor import Tensor" ] }, { "cell_type": "markdown", - "id": "443dd927", + "id": "ba7349a9", "metadata": { "cell_marker": "\"\"\"" }, @@ -100,23 +98,40 @@ "\n", "### The Text-to-Numbers Challenge\n", "\n", - "Consider the sentence: \"Hello, world!\"\n", + "Consider the sentence: \"Hello, world!\" - how do we turn this into numbers a neural network can process?\n", "\n", "```\n", - "Human Text: \"Hello, world!\"\n", - " ↓\n", - " [Tokenization]\n", - " ↓\n", - "Numerical IDs: [72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33]\n", + "┌─────────────────────────────────────────────────────────────────┐\n", + "│ TOKENIZATION PIPELINE: Text → Numbers │\n", + 
"├─────────────────────────────────────────────────────────────────┤\n", + "│ │\n", + "│ Input (Human Text): \"Hello, world!\" │\n", + "│ │ │\n", + "│ ├─ Step 1: Split into tokens │\n", + "│ │ ['H','e','l','l','o',',', ...] │\n", + "│ │ │\n", + "│ ├─ Step 2: Map to vocabulary IDs │\n", + "│ │ [72, 101, 108, 108, 111, ...] │\n", + "│ │ │\n", + "│ ├─ Step 3: Handle unknowns │\n", + "│ │ Unknown chars → special token │\n", + "│ │ │\n", + "│ └─ Step 4: Enable decoding │\n", + "│ IDs → original text │\n", + "│ │\n", + "│ Output (Token IDs): [72, 101, 108, 108, 111, 44, 32, ...] │\n", + "│ │\n", + "└─────────────────────────────────────────────────────────────────┘\n", "```\n", "\n", "### The Four-Step Process\n", "\n", - "How do we represent this for a neural network? We need to:\n", - "1. **Split text into tokens** - meaningful units like words, subwords, or characters\n", - "2. **Map tokens to integers** - create a vocabulary that assigns unique IDs\n", - "3. **Handle unknown text** - deal with words not seen during training\n", - "4. **Enable reconstruction** - convert numbers back to readable text\n", + "How do we represent text for a neural network? We need a systematic pipeline:\n", + "\n", + "**1. Split text into tokens** - Break text into meaningful units (words, subwords, or characters)\n", + "**2. Map tokens to integers** - Create a vocabulary that assigns each token a unique ID\n", + "**3. Handle unknown text** - Deal gracefully with tokens not seen during training\n", + "**4. 
Enable reconstruction** - Convert numbers back to readable text for interpretation\n", "\n", "### Why This Matters\n", "\n", @@ -129,7 +144,7 @@ }, { "cell_type": "markdown", - "id": "7e997606", + "id": "c39ef970", "metadata": { "cell_marker": "\"\"\"" }, @@ -142,15 +157,59 @@ "**Approach**: Each character gets its own token\n", "\n", "```\n", - "Text: \"Hello world\"\n", - " ↓\n", - "Tokens: ['H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd']\n", - " ↓\n", - "IDs: [8, 5, 12, 12, 15, 0, 23, 15, 18, 12, 4]\n", + "┌──────────────────────────────────────────────────────────────┐\n", + "│ CHARACTER TOKENIZATION PROCESS │\n", + "├──────────────────────────────────────────────────────────────┤\n", + "│ │\n", + "│ Step 1: Build Vocabulary from Unique Characters │\n", + "│ ┌────────────────────────────────────────────────────────┐ │\n", + "│ │ Corpus: [\"hello\", \"world\"] │ │\n", + "│ │ ↓ │ │\n", + "│ │ Unique chars: ['h', 'e', 'l', 'o', 'w', 'r', 'd'] │ │\n", + "│ │ ↓ │ │\n", + "│ │ Vocabulary: ['<UNK>','h','e','l','o','w','r','d'] │ │\n", + "│ │ IDs: 0 1 2 3 4 5 6 7 │ │\n", + "│ └────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "│ Step 2: Encode Text Character by Character │\n", + "│ ┌────────────────────────────────────────────────────────┐ │\n", + "│ │ Text: \"hello\" │ │\n", + "│ │ │ │\n", + "│ │ 'h' → 1 (lookup in vocabulary) │ │\n", + "│ │ 'e' → 2 │ │\n", + "│ │ 'l' → 3 │ │\n", + "│ │ 'l' → 3 │ │\n", + "│ │ 'o' → 4 │ │\n", + "│ │ │ │\n", + "│ │ Result: [1, 2, 3, 3, 4] │ │\n", + "│ └────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "│ Step 3: Decode by Reversing ID Lookup │\n", + "│ ┌────────────────────────────────────────────────────────┐ │\n", + "│ │ IDs: [1, 2, 3, 3, 4] │ │\n", + "│ │ │ │\n", + "│ │ 1 → 'h' (reverse lookup) │ │\n", + "│ │ 2 → 'e' │ │\n", + "│ │ 3 → 'l' │ │\n", + "│ │ 3 → 'l' │ │\n", + "│ │ 4 → 'o' │ │\n", + "│ │ │ │\n", + "│ │ Result: \"hello\" │ │\n", + "│ 
└────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "└──────────────────────────────────────────────────────────────┘\n", "```\n", "\n", - "**Pros**: Small vocabulary (~100), handles any text, no unknown tokens\n", - "**Cons**: Long sequences (1 char = 1 token), limited semantic understanding\n", + "**Pros**: \n", + "- Small vocabulary (~100 chars)\n", + "- Handles any text perfectly\n", + "- No unknown tokens (every character can be mapped)\n", + "- Simple implementation\n", + "\n", + "**Cons**: \n", + "- Long sequences (1 character = 1 token)\n", + "- Limited semantic understanding (no word boundaries)\n", + "- More compute (longer sequences to process)\n", "\n", "### Word-Level Tokenization\n", "**Approach**: Each word gets its own token\n", @@ -197,7 +256,7 @@ }, { "cell_type": "markdown", - "id": "fc75101c", + "id": "13b74a9d", "metadata": { "cell_marker": "\"\"\"" }, @@ -209,7 +268,7 @@ }, { "cell_type": "markdown", - "id": "d1057ce5", + "id": "e8613976", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -231,7 +290,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fa4a37fa", + "id": "bb58a938", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -294,7 +353,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8b107a19", + "id": "ddded2c2", "metadata": { "nbgrader": { "grade": true, @@ -332,7 +391,7 @@ }, { "cell_type": "markdown", - "id": "0207d72c", + "id": "5f2f6599", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -374,7 +433,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c9b4e0b3", + "id": "bdba5211", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -512,7 +571,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6fd3a515", + "id": "037f2a1b", "metadata": { "nbgrader": { "grade": true, @@ -563,7 +622,7 @@ }, { "cell_type": "markdown", - "id": "addbc685", + "id": "6ba4ae7f", "metadata": { "cell_marker": "\"\"\"" }, @@ -579,7 +638,7 @@ }, { 
"cell_type": "markdown", - "id": "eb9653c3", + "id": "1e93979f", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -587,44 +646,90 @@ "source": [ "### Byte Pair Encoding (BPE) Tokenizer\n", "\n", - "BPE is the secret sauce behind modern language models. It learns to merge frequent character pairs, creating subword units that balance vocabulary size with sequence length.\n", + "BPE is the secret sauce behind modern language models (GPT, BERT, etc.). It learns to merge frequent character pairs, creating subword units that balance vocabulary size with sequence length.\n", "\n", "```\n", - "BPE Training Process:\n", - "\n", - "Step 1: Start with character vocabulary\n", - "Text: [\"hello\", \"hello\", \"help\"]\n", - "Initial tokens: [['h','e','l','l','o'], ['h','e','l','l','o'], ['h','e','l','p']]\n", - "\n", - "Step 2: Count character pairs\n", - "('h','e'): 3 times ← Most frequent!\n", - "('e','l'): 3 times\n", - "('l','l'): 2 times\n", - "('l','o'): 2 times\n", - "('l','p'): 1 time\n", - "\n", - "Step 3: Merge most frequent pair\n", - "Merge ('h','e') → 'he'\n", - "Tokens: [['he','l','l','o'], ['he','l','l','o'], ['he','l','p']]\n", - "Vocab: ['h','e','l','o','p','','he'] ← New token added\n", - "\n", - "Step 4: Repeat until target vocabulary size\n", - "Next merge: ('l','l') → 'll'\n", - "Tokens: [['he','ll','o'], ['he','ll','o'], ['he','l','p']]\n", - "Vocab: ['h','e','l','o','p','','he','ll'] ← Growing vocabulary\n", - "\n", - "Final result:\n", - "Text \"hello\" → ['he', 'll', 'o'] → 3 tokens (vs 5 characters)\n", - "Text \"help\" → ['he', 'l', 'p'] → 3 tokens (vs 4 characters)\n", + "┌───────────────────────────────────────────────────────────────────────────┐\n", + "│ BPE TRAINING ALGORITHM: Learning Subword Units │\n", + "├───────────────────────────────────────────────────────────────────────────┤\n", + "│ │\n", + "│ STEP 1: Initialize with Character Vocabulary │\n", + "│ ┌──────────────────────────────────────────────────────────────┐ │\n", 
+ "│ │ Training Data: [\"hello\", \"hello\", \"help\"] │ │\n", + "│ │ │ │\n", + "│ │ Initial Tokens (with end-of-word markers): │ │\n", + "│ │ ['h','e','l','l','o'] (hello) │ │\n", + "│ │ ['h','e','l','l','o'] (hello) │ │\n", + "│ │ ['h','e','l','p'] (help) │ │\n", + "│ │ │ │\n", + "│ │ Starting Vocab: ['h', 'e', 'l', 'o', 'p', '</w>'] │ │\n", + "│ │ ↑ All unique characters │ │\n", + "│ └──────────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "│ STEP 2: Count All Adjacent Pairs │\n", + "│ ┌──────────────────────────────────────────────────────────────┐ │\n", + "│ │ Pair Frequency Analysis: │ │\n", + "│ │ │ │\n", + "│ │ ('h', 'e'): ██████ 3 occurrences ← MOST FREQUENT! │ │\n", + "│ │ ('e', 'l'): ██████ 3 occurrences │ │\n", + "│ │ ('l', 'l'): ████ 2 occurrences │ │\n", + "│ │ ('l', 'o'): ████ 2 occurrences │ │\n", + "│ │ ('o', '</w>'): ████ 2 occurrences │ │\n", + "│ │ ('l', 'p'): ██ 1 occurrence │ │\n", + "│ │ ('p', '</w>'): ██ 1 occurrence │ │\n", + "│ └──────────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "│ STEP 3: Merge Most Frequent Pair │\n", + "│ ┌──────────────────────────────────────────────────────────────┐ │\n", + "│ │ Merge Operation: ('h', 'e') → 'he' │ │\n", + "│ │ │ │\n", + "│ │ BEFORE: AFTER: │ │\n", + "│ │ ['h','e','l','l','o'] → ['he','l','l','o'] │ │\n", + "│ │ ['h','e','l','l','o'] → ['he','l','l','o'] │ │\n", + "│ │ ['h','e','l','p'] → ['he','l','p'] │ │\n", + "│ │ │ │\n", + "│ │ Updated Vocab: ['h','e','l','o','p','</w>', 'he'] │ │\n", + "│ │ ↑ NEW TOKEN! 
│ │\n", + "│ └──────────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "│ STEP 4: Repeat Until Target Vocab Size Reached │\n", + "│ ┌──────────────────────────────────────────────────────────────┐ │\n", + "│ │ Iteration 2: Next most frequent is ('he', 'l') │ │\n", + "│ │ Merge ('he','l') → 'hel' │ │\n", + "│ │ │ │\n", + "│ │ ['he','l','l','o'] → ['hel','l','o'] │ │\n", + "│ │ ['he','l','l','o'] → ['hel','l','o'] │ │\n", + "│ │ ['he','l','p'] → ['hel','p'] │ │\n", + "│ │ │ │\n", + "│ │ Updated Vocab: ['h','e','l','o','p','</w>','he','hel'] │ │\n", + "│ │ ↑ NEW! │ │\n", + "│ │ │ │\n", + "│ │ Continue merging until vocab_size target... │ │\n", + "│ └──────────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "│ FINAL RESULTS: │\n", + "│ ┌──────────────────────────────────────────────────────────────┐ │\n", + "│ │ Trained BPE can now encode efficiently: │ │\n", + "│ │ │ │\n", + "│ │ \"hello\" → ['hel', 'l', 'o'] = 3 tokens (vs 5 chars) │ │\n", + "│ │ \"help\" → ['hel', 'p'] = 2 tokens (vs 4 chars) │ │\n", + "│ │ │ │\n", + "│ │ Key Insights: BPE automatically discovers: │ │\n", + "│ │ - Common prefixes ('he') │ │\n", + "│ │ - Morphological patterns (shared stem 'hel') │ │\n", + "│ │ - Natural word boundaries ('</w>') │ │\n", + "│ └──────────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "└───────────────────────────────────────────────────────────────────────────┘\n", "```\n", "\n", - "BPE discovers natural word boundaries and common patterns automatically!" + "**Why BPE Works**: By starting with characters and iteratively merging frequent pairs, BPE discovers the natural statistical patterns in language. Common words become single tokens, rare words split into recognizable subword pieces!" 
] }, { "cell_type": "code", "execution_count": null, - "id": "95105bc9", + "id": "89452d55", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -911,7 +1016,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49023f77", + "id": "2ceb9e28", "metadata": { "nbgrader": { "grade": true, @@ -966,7 +1071,7 @@ }, { "cell_type": "markdown", - "id": "be8ef10a", + "id": "8e51f1a4", "metadata": { "cell_marker": "\"\"\"" }, @@ -997,7 +1102,7 @@ }, { "cell_type": "markdown", - "id": "12b3d35d", + "id": "6d384f02", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1019,7 +1124,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3dd1e90f", + "id": "20ebcfe2", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1131,7 +1236,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f316410", + "id": "3abc8dcd", "metadata": { "nbgrader": { "grade": true, @@ -1176,7 +1281,7 @@ }, { "cell_type": "markdown", - "id": "a172584f", + "id": "f8b901eb", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1190,7 +1295,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bc583368", + "id": "df2ae12e", "metadata": { "nbgrader": { "grade": false, @@ -1241,7 +1346,7 @@ }, { "cell_type": "markdown", - "id": "dfcdeeb7", + "id": "f23d4b98", "metadata": { "cell_marker": "\"\"\"" }, @@ -1281,17 +1386,63 @@ "\n", "**Memory implications for embedding tables**:\n", "```\n", - "Tokenizer Vocab Size Embed Dim Parameters Memory (fp32)\n", - "Character 100 512 51K 204 KB\n", - "BPE-1K 1,000 512 512K 2.0 MB\n", - "BPE-50K 50,000 512 25.6M 102.4 MB\n", - "Word-100K 100,000 512 51.2M 204.8 MB\n", + "┌─────────────────────────────────────────────────────────────────────┐\n", + "│ EMBEDDING TABLE MEMORY: Vocabulary Size × Embedding Dimension │\n", + "├─────────────────────────────────────────────────────────────────────┤\n", + "│ │\n", + "│ CHARACTER TOKENIZER (Vocab: 100) │\n", + "│ ┌────────────────────────────┐ │\n", + "│ │ 100 × 512 = 
51,200 params │ Memory: 204 KB │\n", + "│ │ ████ │ ↑ Tiny embedding table! │\n", + "│ └────────────────────────────┘ │\n", + "│ │\n", + "│ BPE-SMALL (Vocab: 1,000) │\n", + "│ ┌────────────────────────────┐ │\n", + "│ │ 1K × 512 = 512K params │ Memory: 2.0 MB │\n", + "│ │ ██████████ │ ↑ Still manageable │\n", + "│ └────────────────────────────┘ │\n", + "│ │\n", + "│ BPE-LARGE (Vocab: 50,000) ← MOST PRODUCTION MODELS │\n", + "│ ┌────────────────────────────────────────────────────────┐ │\n", + "│ │ 50K × 512 = 25.6M params │ │\n", + "│ │ ████████████████████████████████████████████████ │ │\n", + "│ │ │ │\n", + "│ │ Memory: 102.4 MB (fp32) │ │\n", + "│ │ 51.2 MB (fp16) ← Half precision saves 50% │ │\n", + "│ │ 25.6 MB (int8) ← Quantization saves 75% │ │\n", + "│ └────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "│ WORD-LEVEL (Vocab: 100,000) │\n", + "│ ┌────────────────────────────────────────────────────────┐ │\n", + "│ │ 100K × 512 = 51.2M params │ │\n", + "│ │ ████████████████████████████████████████████████████ │ │\n", + "│ │ │ │\n", + "│ │ Memory: 204.8 MB (fp32) ← Often too large! 
│ │\n", + "│ │ 102.4 MB (fp16) │ │\n", + "│ └────────────────────────────────────────────────────────┘ │\n", + "│ │\n", + "│ Key Trade-off: │\n", + "│ Larger vocab → Shorter sequences → Less compute │\n", + "│ BUT larger vocab → More embedding memory → Harder to train │\n", + "│ │\n", + "└─────────────────────────────────────────────────────────────────────┘\n", + "\n", + "Real-World Production Examples:\n", + "┌─────────────┬──────────────┬───────────────┬──────────────────┐\n", + "│ Model │ Vocab Size │ Embed Dim │ Embed Memory │\n", + "├─────────────┼──────────────┼───────────────┼──────────────────┤\n", + "│ GPT-2 │ 50,257 │ 1,600 │ 321 MB │\n", + "│ GPT-3 │ 50,257 │ 12,288 │ 2.4 GB │\n", + "│ BERT │ 30,522 │ 768 │ 94 MB │\n", + "│ T5 │ 32,128 │ 512 │ 66 MB │\n", + "│ LLaMA-7B │ 32,000 │ 4,096 │ 524 MB │\n", + "└─────────────┴──────────────┴───────────────┴──────────────────┘\n", "```" ] }, { "cell_type": "markdown", - "id": "423df187", + "id": "a7c5816a", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1305,7 +1456,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6dceaa48", + "id": "2f3cfd32", "metadata": { "nbgrader": { "grade": true, @@ -1397,7 +1548,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8bb055b5", + "id": "9d68a974", "metadata": {}, "outputs": [], "source": [ @@ -1409,7 +1560,7 @@ }, { "cell_type": "markdown", - "id": "824eab53", + "id": "b7885211", "metadata": { "cell_marker": "\"\"\"" }, @@ -1441,7 +1592,7 @@ }, { "cell_type": "markdown", - "id": "3eab9125", + "id": "1c62fd5c", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/10_tokenization/tokenization_dev.py b/modules/source/10_tokenization/tokenization_dev.py index c06f2fec..443746fa 100644 --- a/modules/source/10_tokenization/tokenization_dev.py +++ b/modules/source/10_tokenization/tokenization_dev.py @@ -62,6 +62,7 @@ from tinytorch.text.tokenization import Tokenizer, CharTokenizer, BPETokenizer """ # %% +#| export import 
numpy as np from typing import List, Dict, Tuple, Optional, Set import json diff --git a/tinytorch/text/tokenization.py b/tinytorch/text/tokenization.py index 579bd63b..92801344 100644 --- a/tinytorch/text/tokenization.py +++ b/tinytorch/text/tokenization.py @@ -21,6 +21,16 @@ __all__ = ['Tokenizer', 'CharTokenizer', 'BPETokenizer'] #| default_exp text.tokenization #| export +# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 3 +import numpy as np +from typing import List, Dict, Tuple, Optional, Set +import json +import re +from collections import defaultdict, Counter + +# Import only Module 01 (Tensor) - this module has minimal dependencies +from ..core.tensor import Tensor + # %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 8 class Tokenizer: """