# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/10_tokenization/tokenization_dev.ipynb.
# %% auto 0
__all__ = ['Tokenizer', 'CharTokenizer', 'BPETokenizer']
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 0
import numpy as np
from typing import List, Dict, Tuple, Optional, Set
import json
import re
from collections import defaultdict, Counter
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 8
class Tokenizer:
    """
    Base tokenizer class providing the interface for all tokenizers.

    This defines the contract that all tokenizers must follow:
    - encode(): text → list of token IDs
    - decode(): list of token IDs → text
    """

    def encode(self, text: str) -> List[int]:
        """
        Convert text to a list of token IDs.

        TODO: Implement encoding logic in subclasses

        APPROACH:
        1. Subclasses will override this method
        2. Return list of integer token IDs

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.encode("abc")
        [1, 2, 3]  # ID 0 is reserved for the <UNK> token
        """
        ### BEGIN SOLUTION
        raise NotImplementedError("Subclasses must implement encode()")
        ### END SOLUTION

    def decode(self, tokens: List[int]) -> str:
        """
        Convert list of token IDs back to text.

        TODO: Implement decoding logic in subclasses

        APPROACH:
        1. Subclasses will override this method
        2. Return reconstructed text string

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.decode([1, 2, 3])
        'abc'
        """
        ### BEGIN SOLUTION
        raise NotImplementedError("Subclasses must implement decode()")
        ### END SOLUTION
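
# Editorial sketch (an assumption for illustration, not part of the generated
# module or its exported API). Any concrete Tokenizer is expected to satisfy
# the round-trip property below for text fully covered by its vocabulary.
def _check_round_trip(tokenizer: Tokenizer, text: str) -> bool:
    """Return True if decoding the encoding reproduces the original text."""
    return tokenizer.decode(tokenizer.encode(text)) == text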
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 11
class CharTokenizer(Tokenizer):
    """
    Character-level tokenizer that treats each character as a separate token.

    This is the simplest tokenization approach - every character in the
    vocabulary gets its own unique ID.
    """

    def __init__(self, vocab: Optional[List[str]] = None):
        """
        Initialize character tokenizer.

        TODO: Set up vocabulary mappings

        APPROACH:
        1. Store vocabulary list
        2. Create char→id and id→char mappings
        3. Handle special tokens (unknown character)

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.vocab_size
        4  # 3 chars + 1 unknown token
        """
        ### BEGIN SOLUTION
        if vocab is None:
            vocab = []

        # Add special unknown token
        self.vocab = ['<UNK>'] + vocab
        self.vocab_size = len(self.vocab)

        # Create bidirectional mappings
        self.char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self.id_to_char = {idx: char for idx, char in enumerate(self.vocab)}

        # Store unknown token ID
        self.unk_id = 0
        ### END SOLUTION
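
    # For example (editorial illustration of the __init__ above):
    # CharTokenizer(['a', 'b', 'c']) produces vocab ['<UNK>', 'a', 'b', 'c'],
    # so char_to_id is {'<UNK>': 0, 'a': 1, 'b': 2, 'c': 3} and id_to_char is
    # its inverse.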
    def build_vocab(self, corpus: List[str]) -> None:
        """
        Build vocabulary from a corpus of text.

        TODO: Extract unique characters and build vocabulary

        APPROACH:
        1. Collect all unique characters from corpus
        2. Sort for consistent ordering
        3. Rebuild mappings with new vocabulary

        HINTS:
        - Use set() to find unique characters
        - Join all texts then convert to set
        - Don't forget the <UNK> token
        """
        ### BEGIN SOLUTION
        # Collect all unique characters
        all_chars = set()
        for text in corpus:
            all_chars.update(text)

        # Sort for consistent ordering
        unique_chars = sorted(list(all_chars))

        # Rebuild vocabulary with <UNK> token first
        self.vocab = ['<UNK>'] + unique_chars
        self.vocab_size = len(self.vocab)

        # Rebuild mappings
        self.char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self.id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
        ### END SOLUTION
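
    # For example (editorial illustration): build_vocab(["cab", "abc"]) collects
    # the unique characters {'a', 'b', 'c'}, sorts them, and yields
    # vocab ['<UNK>', 'a', 'b', 'c'] with vocab_size == 4.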
    def encode(self, text: str) -> List[int]:
        """
        Encode text to list of character IDs.

        TODO: Convert each character to its vocabulary ID

        APPROACH:
        1. Iterate through each character in text
        2. Look up character ID in vocabulary
        3. Use unknown token ID for unseen characters

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['h', 'e', 'l', 'o'])
        >>> tokenizer.encode("hello")
        [1, 2, 3, 3, 4]  # maps to h, e, l, l, o
        """
        ### BEGIN SOLUTION
        tokens = []
        for char in text:
            tokens.append(self.char_to_id.get(char, self.unk_id))
        return tokens
        ### END SOLUTION

    def decode(self, tokens: List[int]) -> str:
        """
        Decode list of token IDs back to text.

        TODO: Convert each token ID back to its character

        APPROACH:
        1. Look up each token ID in vocabulary
        2. Join characters into string
        3. Handle invalid token IDs gracefully

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['h', 'e', 'l', 'o'])
        >>> tokenizer.decode([1, 2, 3, 3, 4])
        'hello'
        """
        ### BEGIN SOLUTION
        chars = []
        for token_id in tokens:
            # Use unknown token for invalid IDs
            char = self.id_to_char.get(token_id, '<UNK>')
            chars.append(char)
        return ''.join(chars)
        ### END SOLUTION
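
# Editorial usage sketch (an assumption for illustration; not exported via
# __all__ and not part of the generated module API). Builds a character
# vocabulary from a tiny corpus, then checks the encode/decode round trip.
def _demo_char_tokenizer() -> None:
    """Illustrative round-trip check for CharTokenizer."""
    tokenizer = CharTokenizer()
    tokenizer.build_vocab(["hello world"])
    ids = tokenizer.encode("hello")           # one ID per character
    assert tokenizer.decode(ids) == "hello"   # round trip recovers the text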
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 15
class BPETokenizer(Tokenizer):
    """
    Byte Pair Encoding (BPE) tokenizer that learns subword units.

    BPE works by:
    1. Starting with character-level vocabulary
    2. Finding most frequent character pairs
    3. Merging frequent pairs into single tokens
    4. Repeating until desired vocabulary size
    """

    def __init__(self, vocab_size: int = 1000):
        """
        Initialize BPE tokenizer.

        TODO: Set up basic tokenizer state

        APPROACH:
        1. Store target vocabulary size
        2. Initialize empty vocabulary and merge rules
        3. Set up mappings for encoding/decoding
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.vocab = []
        self.merges = []  # Ordered list of learned merge pairs (token_a, token_b)
        self.token_to_id = {}
        self.id_to_token = {}
        ### END SOLUTION

    def _get_word_tokens(self, word: str) -> List[str]:
        """
        Convert word to list of characters with end-of-word marker.

        TODO: Tokenize word into character sequence

        APPROACH:
        1. Split word into characters
        2. Add </w> marker to last character
        3. Return list of tokens

        EXAMPLE:
        >>> tokenizer._get_word_tokens("hello")
        ['h', 'e', 'l', 'l', 'o</w>']
        """
        ### BEGIN SOLUTION
        if not word:
            return []

        tokens = list(word)
        tokens[-1] += '</w>'  # Mark end of word
        return tokens
        ### END SOLUTION

    def _get_pairs(self, word_tokens: List[str]) -> Set[Tuple[str, str]]:
        """
        Get all adjacent pairs from word tokens.

        TODO: Extract all consecutive character pairs

        APPROACH:
        1. Iterate through adjacent tokens
        2. Create pairs of consecutive tokens
        3. Return set of unique pairs

        EXAMPLE:
        >>> tokenizer._get_pairs(['h', 'e', 'l', 'l', 'o</w>'])
        {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o</w>')}
        """
        ### BEGIN SOLUTION
        pairs = set()
        for i in range(len(word_tokens) - 1):
            pairs.add((word_tokens[i], word_tokens[i + 1]))
        return pairs
        ### END SOLUTION
    def train(self, corpus: List[str], vocab_size: Optional[int] = None) -> None:
        """
        Train BPE on corpus to learn merge rules.

        TODO: Implement BPE training algorithm

        APPROACH:
        1. Build initial character vocabulary
        2. Count word frequencies in corpus
        3. Iteratively merge most frequent pairs
        4. Build final vocabulary and mappings

        HINTS:
        - Start with character-level tokens
        - Use frequency counts to guide merging
        - Stop when vocabulary reaches target size
        (A worked trace of the merge loop follows this method.)
        """
        ### BEGIN SOLUTION
        if vocab_size:
            self.vocab_size = vocab_size

        # Count word frequencies
        word_freq = Counter(corpus)

        # Initialize vocabulary with characters
        vocab = set()
        word_tokens = {}
        for word in word_freq:
            tokens = self._get_word_tokens(word)
            word_tokens[word] = tokens
            vocab.update(tokens)

        # Convert to sorted list for consistency
        self.vocab = sorted(list(vocab))

        # Add special tokens
        if '<UNK>' not in self.vocab:
            self.vocab = ['<UNK>'] + self.vocab

        # Learn merges
        self.merges = []
        while len(self.vocab) < self.vocab_size:
            # Count all pairs across all words
            pair_counts = Counter()
            for word, freq in word_freq.items():
                tokens = word_tokens[word]
                pairs = self._get_pairs(tokens)
                for pair in pairs:
                    pair_counts[pair] += freq

            if not pair_counts:
                break

            # Get most frequent pair
            best_pair = pair_counts.most_common(1)[0][0]

            # Merge this pair in all words
            for word in word_tokens:
                tokens = word_tokens[word]
                new_tokens = []
                i = 0
                while i < len(tokens):
                    if (i < len(tokens) - 1 and
                            tokens[i] == best_pair[0] and
                            tokens[i + 1] == best_pair[1]):
                        # Merge pair
                        new_tokens.append(best_pair[0] + best_pair[1])
                        i += 2
                    else:
                        new_tokens.append(tokens[i])
                        i += 1
                word_tokens[word] = new_tokens

            # Add merged token to vocabulary
            merged_token = best_pair[0] + best_pair[1]
            self.vocab.append(merged_token)
            self.merges.append(best_pair)

        # Build final mappings
        self._build_mappings()
        ### END SOLUTION
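
    # Worked trace (editorial illustration; the corpus and target size are
    # assumptions, not taken from the module). With corpus = ["low", "low", "lower"],
    # the initial word tokens are
    #   "low"   -> ['l', 'o', 'w</w>']               (frequency 2)
    #   "lower" -> ['l', 'o', 'w', 'e', 'r</w>']     (frequency 1)
    # The pair ('l', 'o') occurs in both words, so it is counted 2 + 1 = 3 times
    # and merged first, producing 'lo'. On the next pass ('lo', 'w</w>') has
    # count 2 and is merged into 'low</w>'. The loop stops once len(self.vocab)
    # reaches self.vocab_size (or no pairs remain), and self.merges records the
    # pairs in order: [('l', 'o'), ('lo', 'w</w>'), ...].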
    def _build_mappings(self):
        """Build token-to-ID and ID-to-token mappings."""
        ### BEGIN SOLUTION
        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
        self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}
        ### END SOLUTION

    def _apply_merges(self, tokens: List[str]) -> List[str]:
        """
        Apply learned merge rules to token sequence.

        TODO: Apply BPE merges to token list

        APPROACH:
        1. Start with character-level tokens
        2. Apply each merge rule in order
        3. Continue until no more merges possible
        """
        ### BEGIN SOLUTION
        if not self.merges:
            return tokens

        for merge_pair in self.merges:
            new_tokens = []
            i = 0
            while i < len(tokens):
                if (i < len(tokens) - 1 and
                        tokens[i] == merge_pair[0] and
                        tokens[i + 1] == merge_pair[1]):
                    # Apply merge
                    new_tokens.append(merge_pair[0] + merge_pair[1])
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens
        return tokens
        ### END SOLUTION
    def encode(self, text: str) -> List[int]:
        """
        Encode text using BPE.

        TODO: Apply BPE encoding to text

        APPROACH:
        1. Split text into words
        2. Convert each word to character tokens
        3. Apply BPE merges
        4. Convert to token IDs
        """
        ### BEGIN SOLUTION
        if not self.vocab:
            return []

        # Simple word splitting (could be more sophisticated)
        words = text.split()

        all_tokens = []
        for word in words:
            # Get character-level tokens
            word_tokens = self._get_word_tokens(word)
            # Apply BPE merges
            merged_tokens = self._apply_merges(word_tokens)
            all_tokens.extend(merged_tokens)

        # Convert to IDs
        token_ids = []
        for token in all_tokens:
            token_ids.append(self.token_to_id.get(token, 0))  # 0 = <UNK>
        return token_ids
        ### END SOLUTION
    def decode(self, tokens: List[int]) -> str:
        """
        Decode token IDs back to text.

        TODO: Convert token IDs back to readable text

        APPROACH:
        1. Convert IDs to tokens
        2. Join tokens together
        3. Clean up word boundaries and markers
        """
        ### BEGIN SOLUTION
        if not self.id_to_token:
            return ""

        # Convert IDs to tokens
        token_strings = []
        for token_id in tokens:
            token = self.id_to_token.get(token_id, '<UNK>')
            token_strings.append(token)

        # Join and clean up
        text = ''.join(token_strings)

        # Replace end-of-word markers with spaces
        text = text.replace('</w>', ' ')

        # Clean up extra spaces
        text = ' '.join(text.split())
        return text
        ### END SOLUTION
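
# Editorial usage sketch (assumption: a __main__ demo, not part of the
# generated module). Trains the BPE tokenizer on a tiny word list and shows an
# encode/decode round trip; the corpus and vocab_size here are illustrative.
if __name__ == "__main__":
    bpe = BPETokenizer(vocab_size=20)
    bpe.train(["low", "low", "lower", "lowest", "new", "newer"])
    ids = bpe.encode("lower new")
    print(ids)              # token IDs after applying the learned merges
    print(bpe.decode(ids))  # "lower new" (end-of-word markers become spaces)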