Mirror of https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 02:02:21 -05:00
Re-exported all modules after restructuring:
- Updated _modidx.py with new module locations
- Removed outdated autogeneration headers
- Updated all core modules (tensor, autograd, layers, etc.)
- Updated optimization modules (quantization, compression, etc.)
- Updated TITO commands for new structure

Changes include:
- 24 tinytorch/ module files
- 24 tito/ command and core files
- Updated references from modules/source/ to modules/

All modules re-exported via nbdev from their new locations.
455 lines
13 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/10_tokenization/tokenization_dev.ipynb.

# %% auto 0
__all__ = ['Tokenizer', 'CharTokenizer', 'BPETokenizer']

# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 0
import numpy as np
from typing import List, Dict, Tuple, Optional, Set
import json
import re
from collections import defaultdict, Counter

# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 8
class Tokenizer:
    """
    Base tokenizer class providing the interface for all tokenizers.

    This defines the contract that all tokenizers must follow:
    - encode(): text → list of token IDs
    - decode(): list of token IDs → text
    """

    def encode(self, text: str) -> List[int]:
        """
        Convert text to a list of token IDs.

        TODO: Implement encoding logic in subclasses

        APPROACH:
        1. Subclasses will override this method
        2. Return a list of integer token IDs

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.encode("abc")
        [1, 2, 3]  # ID 0 is reserved for the <UNK> token
        """
        ### BEGIN SOLUTION
        raise NotImplementedError("Subclasses must implement encode()")
        ### END SOLUTION

    def decode(self, tokens: List[int]) -> str:
        """
        Convert a list of token IDs back to text.

        TODO: Implement decoding logic in subclasses

        APPROACH:
        1. Subclasses will override this method
        2. Return the reconstructed text string

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.decode([1, 2, 3])
        "abc"
        """
        ### BEGIN SOLUTION
        raise NotImplementedError("Subclasses must implement decode()")
        ### END SOLUTION
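
# --- Illustrative sketch (not part of the generated module) ---------------
# The base class above only specifies the encode/decode contract. Stated as
# code: encoding then decoding should reproduce the input text. The helper
# below is a sketch under that assumption; it compares after whitespace
# normalization so subword tokenizers that rebuild spaces from end-of-word
# markers also satisfy it. The name _roundtrip_ok is an assumption, not an
# exported API.
def _roundtrip_ok(tokenizer: Tokenizer, text: str) -> bool:
    """Return True if decode(encode(text)) reproduces text, up to whitespace."""
    def normalize(s: str) -> str:
        return ' '.join(s.split())
    return normalize(tokenizer.decode(tokenizer.encode(text))) == normalize(text)
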
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 11
class CharTokenizer(Tokenizer):
    """
    Character-level tokenizer that treats each character as a separate token.

    This is the simplest tokenization approach - every character in the
    vocabulary gets its own unique ID.
    """

    def __init__(self, vocab: Optional[List[str]] = None):
        """
        Initialize character tokenizer.

        TODO: Set up vocabulary mappings

        APPROACH:
        1. Store vocabulary list
        2. Create char→id and id→char mappings
        3. Handle special tokens (unknown character)

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.vocab_size
        4  # 3 chars + 1 unknown token
        """
        ### BEGIN SOLUTION
        if vocab is None:
            vocab = []

        # Add special unknown token
        self.vocab = ['<UNK>'] + vocab
        self.vocab_size = len(self.vocab)

        # Create bidirectional mappings
        self.char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self.id_to_char = {idx: char for idx, char in enumerate(self.vocab)}

        # Store unknown token ID
        self.unk_id = 0
        ### END SOLUTION

    def build_vocab(self, corpus: List[str]) -> None:
        """
        Build vocabulary from a corpus of text.

        TODO: Extract unique characters and build vocabulary

        APPROACH:
        1. Collect all unique characters from corpus
        2. Sort for consistent ordering
        3. Rebuild mappings with new vocabulary

        HINTS:
        - Use set() to find unique characters
        - Join all texts then convert to set
        - Don't forget the <UNK> token
        """
        ### BEGIN SOLUTION
        # Collect all unique characters
        all_chars = set()
        for text in corpus:
            all_chars.update(text)

        # Sort for consistent ordering
        unique_chars = sorted(list(all_chars))

        # Rebuild vocabulary with <UNK> token first
        self.vocab = ['<UNK>'] + unique_chars
        self.vocab_size = len(self.vocab)

        # Rebuild mappings
        self.char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self.id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
        ### END SOLUTION

    def encode(self, text: str) -> List[int]:
        """
        Encode text to a list of character IDs.

        TODO: Convert each character to its vocabulary ID

        APPROACH:
        1. Iterate through each character in text
        2. Look up the character ID in the vocabulary
        3. Use the unknown token ID for unseen characters

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['h', 'e', 'l', 'o'])
        >>> tokenizer.encode("hello")
        [1, 2, 3, 3, 4]  # maps to h, e, l, l, o
        """
        ### BEGIN SOLUTION
        tokens = []
        for char in text:
            tokens.append(self.char_to_id.get(char, self.unk_id))
        return tokens
        ### END SOLUTION

    def decode(self, tokens: List[int]) -> str:
        """
        Decode a list of token IDs back to text.

        TODO: Convert each token ID back to its character

        APPROACH:
        1. Look up each token ID in the vocabulary
        2. Join characters into a string
        3. Handle invalid token IDs gracefully

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['h', 'e', 'l', 'o'])
        >>> tokenizer.decode([1, 2, 3, 3, 4])
        "hello"
        """
        ### BEGIN SOLUTION
        chars = []
        for token_id in tokens:
            # Use unknown token for invalid IDs
            char = self.id_to_char.get(token_id, '<UNK>')
            chars.append(char)
        return ''.join(chars)
        ### END SOLUTION
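
# --- Illustrative usage sketch (not part of the generated module) ---------
# A minimal example of the CharTokenizer workflow: build a vocabulary from a
# small corpus, then encode and decode. The corpus, the helper name
# _char_tokenizer_demo, and the values shown are assumptions for
# demonstration only.
def _char_tokenizer_demo() -> None:
    corpus = ["hello world"]
    tok = CharTokenizer()
    tok.build_vocab(corpus)      # vocab = ['<UNK>'] + sorted unique characters
    ids = tok.encode("hello")    # unseen characters would map to ID 0 (<UNK>)
    assert tok.decode(ids) == "hello"
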
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 15
class BPETokenizer(Tokenizer):
    """
    Byte Pair Encoding (BPE) tokenizer that learns subword units.

    BPE works by:
    1. Starting with a character-level vocabulary
    2. Finding the most frequent character pairs
    3. Merging frequent pairs into single tokens
    4. Repeating until the desired vocabulary size is reached
    """

    def __init__(self, vocab_size: int = 1000):
        """
        Initialize BPE tokenizer.

        TODO: Set up basic tokenizer state

        APPROACH:
        1. Store target vocabulary size
        2. Initialize empty vocabulary and merge rules
        3. Set up mappings for encoding/decoding
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.vocab = []
        self.merges = []  # Merged pairs, in the order they were learned
        self.token_to_id = {}
        self.id_to_token = {}
        ### END SOLUTION

    def _get_word_tokens(self, word: str) -> List[str]:
        """
        Convert word to a list of characters with an end-of-word marker.

        TODO: Tokenize word into character sequence

        APPROACH:
        1. Split word into characters
        2. Add </w> marker to the last character
        3. Return list of tokens

        EXAMPLE:
        >>> tokenizer._get_word_tokens("hello")
        ['h', 'e', 'l', 'l', 'o</w>']
        """
        ### BEGIN SOLUTION
        if not word:
            return []

        tokens = list(word)
        tokens[-1] += '</w>'  # Mark end of word
        return tokens
        ### END SOLUTION

    def _get_pairs(self, word_tokens: List[str]) -> Set[Tuple[str, str]]:
        """
        Get all adjacent pairs from word tokens.

        TODO: Extract all consecutive character pairs

        APPROACH:
        1. Iterate through adjacent tokens
        2. Create pairs of consecutive tokens
        3. Return set of unique pairs

        EXAMPLE:
        >>> tokenizer._get_pairs(['h', 'e', 'l', 'l', 'o</w>'])
        {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o</w>')}
        """
        ### BEGIN SOLUTION
        pairs = set()
        for i in range(len(word_tokens) - 1):
            pairs.add((word_tokens[i], word_tokens[i + 1]))
        return pairs
        ### END SOLUTION

    def train(self, corpus: List[str], vocab_size: Optional[int] = None) -> None:
        """
        Train BPE on a corpus to learn merge rules.

        TODO: Implement BPE training algorithm

        APPROACH:
        1. Build initial character vocabulary
        2. Count word frequencies in corpus
        3. Iteratively merge most frequent pairs
        4. Build final vocabulary and mappings

        HINTS:
        - Start with character-level tokens
        - Use frequency counts to guide merging
        - Stop when vocabulary reaches target size
        """
        ### BEGIN SOLUTION
        if vocab_size:
            self.vocab_size = vocab_size

        # Count word frequencies (split each text into whitespace-delimited
        # words so training uses the same word segmentation as encode())
        word_freq = Counter()
        for text in corpus:
            word_freq.update(text.split())

        # Initialize vocabulary with characters
        vocab = set()
        word_tokens = {}

        for word in word_freq:
            tokens = self._get_word_tokens(word)
            word_tokens[word] = tokens
            vocab.update(tokens)

        # Convert to sorted list for consistency
        self.vocab = sorted(list(vocab))

        # Add special tokens
        if '<UNK>' not in self.vocab:
            self.vocab = ['<UNK>'] + self.vocab

        # Learn merges
        self.merges = []

        while len(self.vocab) < self.vocab_size:
            # Count all pairs across all words
            pair_counts = Counter()

            for word, freq in word_freq.items():
                tokens = word_tokens[word]
                pairs = self._get_pairs(tokens)
                for pair in pairs:
                    pair_counts[pair] += freq

            if not pair_counts:
                break

            # Get most frequent pair
            best_pair = pair_counts.most_common(1)[0][0]

            # Merge this pair in all words
            for word in word_tokens:
                tokens = word_tokens[word]
                new_tokens = []
                i = 0
                while i < len(tokens):
                    if (i < len(tokens) - 1 and
                            tokens[i] == best_pair[0] and
                            tokens[i + 1] == best_pair[1]):
                        # Merge pair
                        new_tokens.append(best_pair[0] + best_pair[1])
                        i += 2
                    else:
                        new_tokens.append(tokens[i])
                        i += 1
                word_tokens[word] = new_tokens

            # Add merged token to vocabulary
            merged_token = best_pair[0] + best_pair[1]
            self.vocab.append(merged_token)
            self.merges.append(best_pair)

        # Build final mappings
        self._build_mappings()
        ### END SOLUTION

    def _build_mappings(self):
        """Build token-to-ID and ID-to-token mappings."""
        ### BEGIN SOLUTION
        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
        self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}
        ### END SOLUTION

    def _apply_merges(self, tokens: List[str]) -> List[str]:
        """
        Apply learned merge rules to a token sequence.

        TODO: Apply BPE merges to token list

        APPROACH:
        1. Start with character-level tokens
        2. Apply each merge rule in order
        3. Continue until no more merges are possible
        """
        ### BEGIN SOLUTION
        if not self.merges:
            return tokens

        for merge_pair in self.merges:
            new_tokens = []
            i = 0
            while i < len(tokens):
                if (i < len(tokens) - 1 and
                        tokens[i] == merge_pair[0] and
                        tokens[i + 1] == merge_pair[1]):
                    # Apply merge
                    new_tokens.append(merge_pair[0] + merge_pair[1])
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens

        return tokens
        ### END SOLUTION

    def encode(self, text: str) -> List[int]:
        """
        Encode text using BPE.

        TODO: Apply BPE encoding to text

        APPROACH:
        1. Split text into words
        2. Convert each word to character tokens
        3. Apply BPE merges
        4. Convert to token IDs
        """
        ### BEGIN SOLUTION
        if not self.vocab:
            return []

        # Simple word splitting (could be more sophisticated)
        words = text.split()
        all_tokens = []

        for word in words:
            # Get character-level tokens
            word_tokens = self._get_word_tokens(word)

            # Apply BPE merges
            merged_tokens = self._apply_merges(word_tokens)

            all_tokens.extend(merged_tokens)

        # Convert to IDs
        token_ids = []
        for token in all_tokens:
            token_ids.append(self.token_to_id.get(token, 0))  # 0 = <UNK>

        return token_ids
        ### END SOLUTION

    def decode(self, tokens: List[int]) -> str:
        """
        Decode token IDs back to text.

        TODO: Convert token IDs back to readable text

        APPROACH:
        1. Convert IDs to tokens
        2. Join tokens together
        3. Clean up word boundaries and markers
        """
        ### BEGIN SOLUTION
        if not self.id_to_token:
            return ""

        # Convert IDs to tokens
        token_strings = []
        for token_id in tokens:
            token = self.id_to_token.get(token_id, '<UNK>')
            token_strings.append(token)

        # Join and clean up
        text = ''.join(token_strings)

        # Replace end-of-word markers with spaces
        text = text.replace('</w>', ' ')

        # Clean up extra spaces
        text = ' '.join(text.split())

        return text
        ### END SOLUTION
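
# --- Illustrative usage sketch (not part of the generated module) ---------
# End-to-end BPE example on assumed toy data: train on a tiny corpus, then
# encode and decode. With the corpus below, the most frequent pair ('l', 'o')
# is merged first, and repeated merges gradually collapse frequent substrings
# like "low" into single tokens. The corpus, vocab_size, and printed labels
# are assumptions for demonstration only.
if __name__ == "__main__":
    corpus = ["low", "low", "lower", "lowest", "new", "newer"]
    bpe = BPETokenizer(vocab_size=30)
    bpe.train(corpus)                      # learn merge rules from pair frequencies
    ids = bpe.encode("lower newest")
    print("token ids:", ids)
    print("decoded  :", bpe.decode(ids))   # end-of-word markers become spaces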