# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/10_tokenization/tokenization_dev.ipynb.

# %% auto 0
__all__ = ['Tokenizer', 'CharTokenizer', 'BPETokenizer']

# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 0
import numpy as np
from typing import List, Dict, Tuple, Optional, Set
import json
import re
from collections import defaultdict, Counter

# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 8
class Tokenizer:
    """
    Base tokenizer class providing the interface for all tokenizers.

    This defines the contract that all tokenizers must follow:
    - encode(): text → list of token IDs
    - decode(): list of token IDs → text
    """

    def encode(self, text: str) -> List[int]:
        """
        Convert text to a list of token IDs.

        TODO: Implement encoding logic in subclasses

        APPROACH:
        1. Subclasses will override this method
        2. Return list of integer token IDs

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.encode("abc")
        [1, 2, 3]  # index 0 is reserved for <UNK>
        """
        ### BEGIN SOLUTION
        raise NotImplementedError("Subclasses must implement encode()")
        ### END SOLUTION

    def decode(self, tokens: List[int]) -> str:
        """
        Convert list of token IDs back to text.

        TODO: Implement decoding logic in subclasses

        APPROACH:
        1. Subclasses will override this method
        2. Return reconstructed text string

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.decode([1, 2, 3])
        'abc'
        """
        ### BEGIN SOLUTION
        raise NotImplementedError("Subclasses must implement decode()")
        ### END SOLUTION
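# Illustrative sketch (not part of the original notebook export): any
# concrete subclass should satisfy decode(encode(text)) == text for text
# covered by its vocabulary. A hypothetical minimal subclass showing the
# contract; the leading underscore keeps it out of __all__:
class _ByteTokenizer(Tokenizer):
    """Toy byte-level tokenizer: one ID per byte value (0-255)."""

    def encode(self, text: str) -> List[int]:
        # UTF-8 bytes double as token IDs, so no vocabulary is needed
        return list(text.encode("utf-8"))

    def decode(self, tokens: List[int]) -> str:
        # Invalid byte sequences degrade gracefully instead of raising
        return bytes(tokens).decode("utf-8", errors="replace")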
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 11
class CharTokenizer(Tokenizer):
    """
    Character-level tokenizer that treats each character as a separate token.

    This is the simplest tokenization approach - every character in the
    vocabulary gets its own unique ID.
    """

    def __init__(self, vocab: Optional[List[str]] = None):
        """
        Initialize character tokenizer.

        TODO: Set up vocabulary mappings

        APPROACH:
        1. Store vocabulary list
        2. Create char→id and id→char mappings
        3. Handle special tokens (<UNK> for unknown characters)

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> tokenizer.vocab_size
        4  # 3 chars + 1 unknown token
        """
        ### BEGIN SOLUTION
        if vocab is None:
            vocab = []

        # Add special unknown token
        self.vocab = ['<UNK>'] + vocab
        self.vocab_size = len(self.vocab)

        # Create bidirectional mappings
        self.char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self.id_to_char = {idx: char for idx, char in enumerate(self.vocab)}

        # Store unknown token ID
        self.unk_id = 0
        ### END SOLUTION

    def build_vocab(self, corpus: List[str]) -> None:
        """
        Build vocabulary from a corpus of text.

        TODO: Extract unique characters and build vocabulary

        APPROACH:
        1. Collect all unique characters from corpus
        2. Sort for consistent ordering
        3. Rebuild mappings with new vocabulary

        HINTS:
        - Use set() to find unique characters
        - Join all texts then convert to set
        - Don't forget the <UNK> token
        """
        ### BEGIN SOLUTION
        # Collect all unique characters
        all_chars = set()
        for text in corpus:
            all_chars.update(text)

        # Sort for consistent ordering
        unique_chars = sorted(all_chars)

        # Rebuild vocabulary with <UNK> token first
        self.vocab = ['<UNK>'] + unique_chars
        self.vocab_size = len(self.vocab)

        # Rebuild mappings
        self.char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self.id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
        ### END SOLUTION

    def encode(self, text: str) -> List[int]:
        """
        Encode text to list of character IDs.

        TODO: Convert each character to its vocabulary ID

        APPROACH:
        1. Iterate through each character in text
        2. Look up character ID in vocabulary
        3. Use unknown token ID for unseen characters

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['h', 'e', 'l', 'o'])
        >>> tokenizer.encode("hello")
        [1, 2, 3, 3, 4]  # maps to h, e, l, l, o
        """
        ### BEGIN SOLUTION
        tokens = []
        for char in text:
            tokens.append(self.char_to_id.get(char, self.unk_id))
        return tokens
        ### END SOLUTION

    def decode(self, tokens: List[int]) -> str:
        """
        Decode list of token IDs back to text.

        TODO: Convert each token ID back to its character

        APPROACH:
        1. Look up each token ID in vocabulary
        2. Join characters into string
        3. Handle invalid token IDs gracefully

        EXAMPLE:
        >>> tokenizer = CharTokenizer(['h', 'e', 'l', 'o'])
        >>> tokenizer.decode([1, 2, 3, 3, 4])
        'hello'
        """
        ### BEGIN SOLUTION
        chars = []
        for token_id in tokens:
            # Use unknown token for invalid IDs
            char = self.id_to_char.get(token_id, '<UNK>')
            chars.append(char)
        return ''.join(chars)
        ### END SOLUTION
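# Illustrative usage sketch (not part of the original notebook export):
# build a vocabulary from a tiny corpus, then round-trip a string.
# Characters never seen during build_vocab() map to the <UNK> id (0).
if __name__ == "__main__":
    char_tok = CharTokenizer()
    char_tok.build_vocab(["hello world"])
    ids = char_tok.encode("hello")
    print(ids)                   # [4, 3, 5, 5, 6] with this corpus's sorted vocab
    print(char_tok.decode(ids))  # 'hello'
    print(char_tok.encode("z"))  # [0] -> <UNK>; 'z' is not in the corpus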
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 15
class BPETokenizer(Tokenizer):
    """
    Byte Pair Encoding (BPE) tokenizer that learns subword units.

    BPE works by:
    1. Starting with character-level vocabulary
    2. Finding most frequent character pairs
    3. Merging frequent pairs into single tokens
    4. Repeating until desired vocabulary size
    """

    def __init__(self, vocab_size: int = 1000):
        """
        Initialize BPE tokenizer.

        TODO: Set up basic tokenizer state

        APPROACH:
        1. Store target vocabulary size
        2. Initialize empty vocabulary and merge rules
        3. Set up mappings for encoding/decoding
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.vocab = []
        self.merges = []  # Ordered list of learned merge pairs
        self.token_to_id = {}
        self.id_to_token = {}
        ### END SOLUTION

    def _get_word_tokens(self, word: str) -> List[str]:
        """
        Convert word to list of characters with end-of-word marker.

        TODO: Tokenize word into character sequence

        APPROACH:
        1. Split word into characters
        2. Add </w> marker to last character
        3. Return list of tokens

        EXAMPLE:
        >>> tokenizer._get_word_tokens("hello")
        ['h', 'e', 'l', 'l', 'o</w>']
        """
        ### BEGIN SOLUTION
        if not word:
            return []
        tokens = list(word)
        tokens[-1] += '</w>'  # Mark end of word
        return tokens
        ### END SOLUTION

    def _get_pairs(self, word_tokens: List[str]) -> Set[Tuple[str, str]]:
        """
        Get all adjacent pairs from word tokens.

        TODO: Extract all consecutive character pairs

        APPROACH:
        1. Iterate through adjacent tokens
        2. Create pairs of consecutive tokens
        3. Return set of unique pairs

        EXAMPLE:
        >>> tokenizer._get_pairs(['h', 'e', 'l', 'l', 'o</w>'])
        {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o</w>')}
        """
        ### BEGIN SOLUTION
        pairs = set()
        for i in range(len(word_tokens) - 1):
            pairs.add((word_tokens[i], word_tokens[i + 1]))
        return pairs
        ### END SOLUTION

    def train(self, corpus: List[str], vocab_size: Optional[int] = None) -> None:
        """
        Train BPE on corpus to learn merge rules.

        TODO: Implement BPE training algorithm

        APPROACH:
        1. Build initial character vocabulary
        2. Count word frequencies in corpus
        3. Iteratively merge most frequent pairs
        4. Build final vocabulary and mappings

        HINTS:
        - Start with character-level tokens
        - Use frequency counts to guide merging
        - Stop when vocabulary reaches target size
        """
        ### BEGIN SOLUTION
        if vocab_size:
            self.vocab_size = vocab_size

        # Count word frequencies
        word_freq = Counter(corpus)

        # Initialize vocabulary with characters
        vocab = set()
        word_tokens = {}
        for word in word_freq:
            tokens = self._get_word_tokens(word)
            word_tokens[word] = tokens
            vocab.update(tokens)

        # Convert to sorted list for consistency
        self.vocab = sorted(vocab)

        # Add special tokens
        if '<UNK>' not in self.vocab:
            self.vocab = ['<UNK>'] + self.vocab

        # Learn merges
        self.merges = []
        while len(self.vocab) < self.vocab_size:
            # Count all pairs across all words; pairs come from a set,
            # so each distinct pair counts once per word occurrence
            pair_counts = Counter()
            for word, freq in word_freq.items():
                tokens = word_tokens[word]
                pairs = self._get_pairs(tokens)
                for pair in pairs:
                    pair_counts[pair] += freq

            if not pair_counts:
                break

            # Get most frequent pair
            best_pair = pair_counts.most_common(1)[0][0]

            # Merge this pair in all words
            for word in word_tokens:
                tokens = word_tokens[word]
                new_tokens = []
                i = 0
                while i < len(tokens):
                    if (i < len(tokens) - 1
                            and tokens[i] == best_pair[0]
                            and tokens[i + 1] == best_pair[1]):
                        # Merge pair
                        new_tokens.append(best_pair[0] + best_pair[1])
                        i += 2
                    else:
                        new_tokens.append(tokens[i])
                        i += 1
                word_tokens[word] = new_tokens

            # Add merged token to vocabulary
            merged_token = best_pair[0] + best_pair[1]
            self.vocab.append(merged_token)
            self.merges.append(best_pair)

        # Build final mappings
        self._build_mappings()
        ### END SOLUTION

    def _build_mappings(self):
        """Build token-to-ID and ID-to-token mappings."""
        ### BEGIN SOLUTION
        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
        self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}
        ### END SOLUTION

    def _apply_merges(self, tokens: List[str]) -> List[str]:
        """
        Apply learned merge rules to token sequence.

        TODO: Apply BPE merges to token list

        APPROACH:
        1. Start with character-level tokens
        2. Apply each merge rule in order
        3. Continue until no more merges possible
        """
        ### BEGIN SOLUTION
        if not self.merges:
            return tokens

        for merge_pair in self.merges:
            new_tokens = []
            i = 0
            while i < len(tokens):
                if (i < len(tokens) - 1
                        and tokens[i] == merge_pair[0]
                        and tokens[i + 1] == merge_pair[1]):
                    # Apply merge
                    new_tokens.append(merge_pair[0] + merge_pair[1])
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens

        return tokens
        ### END SOLUTION
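    # Worked trace (illustrative; assumes merges were learned in the order
    # ('l', 'l') then ('ll', 'o</w>')):
    #   _get_word_tokens("hello")    -> ['h', 'e', 'l', 'l', 'o</w>']
    #   after merge ('l', 'l')       -> ['h', 'e', 'll', 'o</w>']
    #   after merge ('ll', 'o</w>')  -> ['h', 'e', 'llo</w>']
    # Each rule is scanned left to right over the whole token list, in the
    # order the rules were learned during train().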
    def encode(self, text: str) -> List[int]:
        """
        Encode text using BPE.

        TODO: Apply BPE encoding to text

        APPROACH:
        1. Split text into words
        2. Convert each word to character tokens
        3. Apply BPE merges
        4. Convert to token IDs
        """
        ### BEGIN SOLUTION
        if not self.vocab:
            return []

        # Simple word splitting (could be more sophisticated)
        words = text.split()
        all_tokens = []

        for word in words:
            # Get character-level tokens
            word_tokens = self._get_word_tokens(word)
            # Apply BPE merges
            merged_tokens = self._apply_merges(word_tokens)
            all_tokens.extend(merged_tokens)

        # Convert to IDs
        token_ids = []
        for token in all_tokens:
            token_ids.append(self.token_to_id.get(token, 0))  # 0 = <UNK>

        return token_ids
        ### END SOLUTION

    def decode(self, tokens: List[int]) -> str:
        """
        Decode token IDs back to text.

        TODO: Convert token IDs back to readable text

        APPROACH:
        1. Convert IDs to tokens
        2. Join tokens together
        3. Clean up word boundaries and </w> markers
        """
        ### BEGIN SOLUTION
        if not self.id_to_token:
            return ""

        # Convert IDs to tokens
        token_strings = []
        for token_id in tokens:
            token = self.id_to_token.get(token_id, '<UNK>')
            token_strings.append(token)

        # Join and clean up
        text = ''.join(token_strings)

        # Replace end-of-word markers with spaces
        text = text.replace('</w>', ' ')

        # Clean up extra spaces
        text = ' '.join(text.split())

        return text
        ### END SOLUTION
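# Illustrative usage sketch (not part of the original notebook export):
# train on a tiny corpus and round-trip a sentence. The exact merges
# learned depend on the corpus and the target vocab_size.
if __name__ == "__main__":
    bpe = BPETokenizer(vocab_size=50)
    corpus = ["low", "low", "lower", "lowest", "new", "newer"]
    bpe.train(corpus)
    ids = bpe.encode("lower newer")
    print(ids)              # subword IDs; words seen in training merge furthest
    print(bpe.decode(ids))  # 'lower newer' (</w> markers become spaces)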