Files
TinyTorch/tinyGPT/core/tokenizer.py
Vijay Janapa Reddi 5386b58e07 Implement interactive ML Systems questions and standardize module structure
Major Educational Framework Enhancements:
• Deploy interactive NBGrader text response questions across ALL modules
• Replace passive question lists with active 150-300 word student responses
• Enable comprehensive ML Systems learning assessment and grading

TinyGPT Integration (Module 16):
• Complete TinyGPT implementation showing 70% component reuse from TinyTorch
• Demonstrates vision-to-language framework generalization principles
• Full transformer architecture with attention, tokenization, and generation
• Shakespeare demo showing autoregressive text generation capabilities

Module Structure Standardization:
• Fix section ordering across all modules: Tests → Questions → Summary
• Ensure Module Summary is always the final section for consistency
• Standardize comprehensive testing patterns before educational content

Interactive Question Implementation:
• 3 focused questions per module replacing 10-15 passive questions
• NBGrader integration with manual grading workflow for text responses
• Questions target ML Systems thinking: scaling, deployment, optimization
• Cumulative knowledge building across the 16-module progression

Technical Infrastructure:
• TPM agent for coordinated multi-agent development workflows
• Enhanced documentation with pedagogical design principles
• Updated book structure to include TinyGPT as capstone demonstration
• Comprehensive QA validation of all module structures

Framework Design Insights:
• Mathematical unity: Dense layers power both vision and language models
• Attention as key innovation for sequential relationship modeling
• Production-ready patterns: training loops, optimization, evaluation
• System-level thinking: memory, performance, scaling considerations

Educational Impact:
• Transform passive learning to active engagement through written responses
• Enable instructors to assess deep ML Systems understanding
• Provide clear progression from foundations to complete language models
• Demonstrate real-world framework design principles and trade-offs
2025-09-17 14:42:24 -04:00

477 lines
16 KiB
Python

"""
Character-level tokenizer for TinyGPT language models.
Implements character-level tokenization for use with TinyGPT transformer models.
This tokenizer converts text to sequences of character tokens and back.
"""
import numpy as np
from typing import List, Optional, Dict, Union
class CharTokenizer:
    """Character-level tokenizer for language models.

    Every character is treated as one token. The vocabulary is built from a
    training text by ``fit()`` (or restored with ``load_vocabulary()``) and
    is then used by ``encode()`` / ``decode()`` to map between text and
    integer token indices.

    Special tokens always occupy the lowest indices, in the order they
    appear in ``special_tokens``. ``<UNK>`` (unknown character) and
    ``<PAD>`` (padding) are always part of the vocabulary; they are appended
    automatically if the caller's list omits them.

    Educational Benefits:
    - Simple and transparent tokenization strategy
    - No complex subword algorithms to understand
    - Direct character-to-token mapping
    - Easy to debug and visualize
    """

    def __init__(self, vocab_size: Optional[int] = None,
                 special_tokens: Optional[List[str]] = None):
        """Initialize character tokenizer.

        Args:
            vocab_size: Maximum vocabulary size, special tokens included
                (None or 0 = unlimited).
            special_tokens: Special tokens to include, in index order
                (e.g., ['<UNK>', '<PAD>']). Defaults to ['<UNK>', '<PAD>'].
                Duplicates are dropped and '<UNK>' / '<PAD>' are appended
                when missing, because encode/decode/batching rely on them.

        Educational Note:
            vocab_size limiting is important for computational efficiency.
            Special tokens handle edge cases like unknown characters.
        """
        self.vocab_size = vocab_size
        # Special token names (must be set before normalizing the list)
        self.unk_token = '<UNK>'
        self.pad_token = '<PAD>'
        self.special_tokens = self._normalize_special_tokens(special_tokens)
        # Core vocabulary mappings
        self.char_to_idx: Dict[str, int] = {}
        self.idx_to_char: Dict[int, str] = {}
        # Placeholder indices; the real values are assigned in fit() or
        # load_vocabulary() from the special token order.
        self.unk_idx = 0
        self.pad_idx = 1
        # State tracking
        self.is_fitted = False
        self.character_counts: Dict[str, int] = {}
        print("🔤 CharTokenizer initialized:")
        print(f" Max vocab size: {vocab_size or 'unlimited'}")
        print(f" Special tokens: {self.special_tokens}")

    def _normalize_special_tokens(self, special_tokens: Optional[List[str]]) -> List[str]:
        """Return a de-duplicated copy of special_tokens containing <UNK> and <PAD>."""
        # dict.fromkeys keeps first-seen order while dropping duplicates; a
        # duplicated token would otherwise claim two indices in idx_to_char.
        tokens = list(dict.fromkeys(special_tokens or [self.unk_token, self.pad_token]))
        for required in (self.unk_token, self.pad_token):
            if required not in tokens:
                tokens.append(required)
        return tokens

    def fit(self, text: str) -> None:
        """Build vocabulary from training text.

        Args:
            text: Training text to build vocabulary from.

        Raises:
            ValueError: If text is empty.

        Educational Process:
            1. Count character frequencies in the text
            2. Add special tokens first (ensures consistent indices)
            3. Add most frequent characters up to vocab_size limit
            4. Create bidirectional mappings for fast lookup
        """
        if not text:
            raise ValueError("Cannot fit tokenizer on empty text")
        print("🔍 Analyzing text for vocabulary...")
        print(f" Text length: {len(text):,} characters")
        # Count character frequencies (dict keeps first-occurrence order)
        counts: Dict[str, int] = {}
        for char in text:
            counts[char] = counts.get(char, 0) + 1
        self.character_counts = counts
        unique_chars = len(counts)
        print(f" Unique characters found: {unique_chars}")
        # Guard against special_tokens having been replaced after __init__
        # (e.g. by load_vocabulary) with a list lacking <UNK>/<PAD>.
        self.special_tokens = self._normalize_special_tokens(self.special_tokens)
        # Add special tokens first (ensures consistent indices)
        self.char_to_idx = {token: i for i, token in enumerate(self.special_tokens)}
        self.idx_to_char = {i: token for token, i in self.char_to_idx.items()}
        # Set special token indices
        self.unk_idx = self.char_to_idx[self.unk_token]
        self.pad_idx = self.char_to_idx[self.pad_token]
        # Sort characters by frequency (most frequent first). sorted() is
        # stable, so ties keep their first-occurrence order.
        sorted_chars = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        # Add characters to vocabulary up to limit
        current_idx = len(self.special_tokens)
        chars_added = 0
        for char, _count in sorted_chars:
            # Only possible when a special token is itself a single character
            if char in self.char_to_idx:
                continue
            # Check vocab size limit (a falsy vocab_size means unlimited)
            if self.vocab_size and current_idx >= self.vocab_size:
                break
            self.char_to_idx[char] = current_idx
            self.idx_to_char[current_idx] = char
            current_idx += 1
            chars_added += 1
        self.is_fitted = True
        print("✅ Vocabulary built successfully:")
        print(f" Final vocab size: {len(self.char_to_idx)}")
        print(f" Characters included: {chars_added}")
        if self.vocab_size and chars_added < unique_chars:
            excluded = unique_chars - chars_added
            print(f" Characters excluded: {excluded} (will map to <UNK>)")
        # Show most frequent characters
        print(f" Most frequent: {sorted_chars[:10]}")

    def encode(self, text: str) -> List[int]:
        """Convert text to sequence of token indices.

        Args:
            text: Text to encode.

        Returns:
            List of token indices, one per character (empty for empty text).

        Raises:
            RuntimeError: If the tokenizer has not been fitted.

        Educational Note:
            Characters not in vocabulary are mapped to <UNK> token.
            This handles rare characters and maintains fixed vocabulary size.
        """
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before encoding")
        if not text:
            return []
        indices = []
        unk_count = 0
        for char in text:
            if char in self.char_to_idx:
                indices.append(self.char_to_idx[char])
            else:
                indices.append(self.unk_idx)
                unk_count += 1
        if unk_count > 0:
            unk_rate = unk_count / len(text) * 100
            print(f"⚠️ Encoding: {unk_count} unknown chars ({unk_rate:.1f}%)")
        return indices

    def decode(self, indices: Union[List[int], np.ndarray]) -> str:
        """Convert sequence of token indices back to text.

        Args:
            indices: Token indices to decode. A list or a NumPy array (for
                example a row of ``encode_batch`` output); arrays are
                flattened before decoding.

        Returns:
            Decoded text string. <PAD> tokens are dropped, <UNK> tokens are
            kept as the literal text '<UNK>'.

        Raises:
            RuntimeError: If the tokenizer has not been fitted.

        Educational Note:
            Invalid indices are skipped to handle generation errors gracefully.
        """
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before decoding")
        if indices is None:
            return ""
        if isinstance(indices, np.ndarray):
            # `not array` is ambiguous for len > 1, so never truth-test an
            # array; tolist() also yields plain Python ints for dict lookup.
            indices = indices.ravel().tolist()
        chars = []
        invalid_count = 0
        for idx in indices:
            if idx in self.idx_to_char:
                char = self.idx_to_char[idx]
                # Drop padding from the output; keep <UNK> for debugging
                if char != self.pad_token:
                    chars.append(char)
            else:
                invalid_count += 1
        if invalid_count > 0:
            print(f"⚠️ Decoding: {invalid_count} invalid indices skipped")
        return ''.join(chars)

    def get_vocab_size(self) -> int:
        """Get the current vocabulary size.

        Returns:
            Number of tokens in vocabulary (special tokens included).
        """
        return len(self.char_to_idx)

    def encode_batch(self, texts: List[str], max_length: Optional[int] = None,
                     padding: bool = True, truncation: bool = True) -> np.ndarray:
        """Encode batch of texts with optional padding and truncation.

        Args:
            texts: List of texts to encode.
            max_length: Maximum sequence length (None = longest in batch).
            padding: Accepted for API symmetry. The result is always a
                rectangular array, so shorter rows are always filled with
                the <PAD> index regardless of this flag.
            truncation: If True, sequences longer than max_length are cut
                from the end. If False, nothing is dropped: the array is
                widened to the longest sequence when that exceeds max_length.

        Returns:
            2D int32 numpy array of shape (batch_size, width), where width
            is max_length unless widened as described above. An empty batch
            yields shape (0, max_length or 0).

        Raises:
            RuntimeError: If the tokenizer has not been fitted.

        Educational Benefits:
            - Demonstrates batch processing for efficiency
            - Shows padding/truncation strategies for variable length sequences
            - Prepares data in format expected by neural networks
        """
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before encoding")
        if not texts:
            # Keep rank and dtype consistent with the non-empty case
            return np.empty((0, max_length or 0), dtype=np.int32)
        # Encode all texts
        encoded_texts = [self.encode(text) for text in texts]
        # Determine array width
        longest = max(len(encoded) for encoded in encoded_texts)
        if max_length is None or (not truncation and longest > max_length):
            max_length = longest
        # Prepare batch array pre-filled with padding
        batch_array = np.full((len(texts), max_length), self.pad_idx, dtype=np.int32)
        # Copy each sequence in; the slice truncates from the end if needed
        for i, encoded in enumerate(encoded_texts):
            sequence = encoded[:max_length]
            batch_array[i, :len(sequence)] = sequence
        return batch_array

    def get_vocabulary(self) -> Dict[str, int]:
        """Get the complete vocabulary mapping.

        Returns:
            A copy of the dictionary mapping tokens/characters to indices.
        """
        return self.char_to_idx.copy()

    def get_special_tokens(self) -> Dict[str, int]:
        """Get special token mappings.

        Returns:
            Dictionary mapping special tokens to indices.

        Raises:
            KeyError: If the vocabulary has not been built yet.
        """
        return {token: self.char_to_idx[token] for token in self.special_tokens}

    def analyze_text(self, text: str) -> Dict[str, Union[int, float]]:
        """Analyze text with current vocabulary.

        Args:
            text: Text to analyze.

        Returns:
            Dictionary with the keys 'length', 'tokens', 'unique_chars',
            'vocab_coverage' (number of distinct characters of the text that
            are in the vocabulary), 'unk_count', 'unk_rate' (percent) and
            'compression_ratio' (characters per token). For empty text all
            values are zero and a legacy 'coverage' key is also present.

        Raises:
            RuntimeError: If the tokenizer has not been fitted.

        Educational Purpose:
            Helps understand vocabulary coverage and tokenization quality.
        """
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before analysis")
        if not text:
            # Same keys as the non-empty result so callers can index
            # uniformly; 'coverage' is kept for backward compatibility.
            return {
                'length': 0,
                'tokens': 0,
                'unique_chars': 0,
                'vocab_coverage': 0,
                'unk_count': 0,
                'unk_rate': 0.0,
                'compression_ratio': 0.0,
                'coverage': 0.0,
            }
        indices = self.encode(text)
        unk_count = sum(1 for idx in indices if idx == self.unk_idx)
        distinct = set(text)
        stats = {
            'length': len(text),
            'tokens': len(indices),
            'unique_chars': len(distinct),
            'vocab_coverage': len(distinct & set(self.char_to_idx.keys())),
            'unk_count': unk_count,
            'unk_rate': unk_count / len(indices) * 100 if indices else 0.0,
            'compression_ratio': len(text) / len(indices) if indices else 0.0
        }
        return stats

    def save_vocabulary(self, filepath: str) -> None:
        """Save vocabulary to a UTF-8 JSON file for reuse.

        Args:
            filepath: Path to save vocabulary file.

        Raises:
            RuntimeError: If the tokenizer has not been fitted.

        Educational Note:
            In production, you'd want to save/load vocabularies to ensure
            consistency across training and inference.
        """
        import json
        if not self.is_fitted:
            raise RuntimeError("Cannot save unfitted tokenizer")
        vocab_data = {
            'char_to_idx': self.char_to_idx,
            'special_tokens': self.special_tokens,
            'vocab_size': self.vocab_size,
            'character_counts': self.character_counts
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(vocab_data, f, ensure_ascii=False, indent=2)
        print(f"💾 Vocabulary saved to {filepath}")

    def load_vocabulary(self, filepath: str) -> None:
        """Load vocabulary from a file written by save_vocabulary().

        Replaces the current vocabulary, special tokens, vocab_size and
        character counts, and marks the tokenizer as fitted.

        Args:
            filepath: Path to vocabulary file.

        Raises:
            KeyError: If the file lacks an expected field or its vocabulary
                does not contain the <UNK> / <PAD> tokens.
        """
        import json
        with open(filepath, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)
        self.char_to_idx = vocab_data['char_to_idx']
        self.special_tokens = vocab_data['special_tokens']
        self.vocab_size = vocab_data['vocab_size']
        self.character_counts = vocab_data['character_counts']
        # Rebuild reverse mapping
        self.idx_to_char = {int(idx): char for char, idx in self.char_to_idx.items()}
        # Set special token indices
        self.unk_idx = self.char_to_idx[self.unk_token]
        self.pad_idx = self.char_to_idx[self.pad_token]
        self.is_fitted = True
        print(f"📁 Vocabulary loaded from {filepath}")
        print(f" Vocab size: {len(self.char_to_idx)}")
if __name__ == "__main__":
    # Smoke-test / demo for CharTokenizer: fits on a short Shakespeare
    # excerpt, then exercises encode/decode, batching, analysis, a very
    # small vocabulary, and a rough throughput measurement.
    import time

    print("🧪 Testing CharTokenizer")
    print("=" * 50)
    # Sample text for testing
    sample_text = (
        "To be, or not to be, that is the question:\n"
        "Whether 'tis nobler in the mind to suffer\n"
        "The slings and arrows of outrageous fortune,\n"
        "Or to take arms against a sea of troubles\n"
        "And by opposing end them."
    )
    print(f"📝 Sample text ({len(sample_text)} chars):")
    print(f"'{sample_text[:100]}...'")
    print()
    # Test basic tokenization
    print("🔤 Basic Tokenization Test:")
    tokenizer = CharTokenizer(vocab_size=50)
    tokenizer.fit(sample_text)
    print()
    # Test encoding/decoding
    test_phrase = "To be or not to be"
    print("🔬 Encoding/Decoding Test:")
    print(f"Original: '{test_phrase}'")
    encoded = tokenizer.encode(test_phrase)
    print(f"Encoded: {encoded}")
    decoded = tokenizer.decode(encoded)
    print(f"Decoded: '{decoded}'")
    print(f"Round-trip successful: {test_phrase == decoded}")
    print()
    # Test batch encoding
    print("📦 Batch Encoding Test:")
    batch_texts = [
        "To be",
        "or not to be",
        "that is the question"
    ]
    batch_encoded = tokenizer.encode_batch(batch_texts, max_length=20)
    print(f"Batch shape: {batch_encoded.shape}")
    print(f"Batch sample:\n{batch_encoded}")
    print()
    # Test vocabulary analysis
    print("📊 Vocabulary Analysis:")
    vocab = tokenizer.get_vocabulary()
    special_tokens = tokenizer.get_special_tokens()
    print(f"Total vocabulary size: {len(vocab)}")
    print(f"Special tokens: {special_tokens}")
    print(f"Sample characters: {list(vocab.keys())[:20]}")
    print()
    # Test text analysis
    print("🔍 Text Analysis:")
    stats = tokenizer.analyze_text(sample_text)
    for key, value in stats.items():
        if isinstance(value, float):
            print(f" {key}: {value:.2f}")
        else:
            print(f" {key}: {value}")
    print()
    # Test with limited vocabulary: most characters of the test text fall
    # outside the 10-entry vocabulary and come back as <UNK>.
    print("⚠️ Limited Vocabulary Test:")
    small_tokenizer = CharTokenizer(vocab_size=10)  # Very small vocab
    small_tokenizer.fit("abcdefghijklmnopqrstuvwxyz")
    test_text = "Hello, World!"
    encoded_small = small_tokenizer.encode(test_text)
    decoded_small = small_tokenizer.decode(encoded_small)
    print(f"Original: '{test_text}'")
    print(f"Decoded: '{decoded_small}'")
    print(f"Small vocab size: {small_tokenizer.get_vocab_size()}")
    print()
    # Performance characteristics
    print("⚡ Performance Characteristics:")
    long_text = sample_text * 100  # Make it longer
    # perf_counter() is a high-resolution monotonic clock; time.time() can
    # report a 0.0 delta for short runs on coarse system clocks.
    start_time = time.perf_counter()
    encoded_long = tokenizer.encode(long_text)
    encoding_time = time.perf_counter() - start_time
    # Decoding speed test
    start_time = time.perf_counter()
    decoded_long = tokenizer.decode(encoded_long)
    decoding_time = time.perf_counter() - start_time
    # Guard the rate computation against a zero elapsed time
    encode_rate = len(long_text) / encoding_time if encoding_time > 0 else float('inf')
    decode_rate = len(encoded_long) / decoding_time if decoding_time > 0 else float('inf')
    print(f"Text length: {len(long_text):,} chars")
    print(f"Encoding time: {encoding_time:.4f}s ({encode_rate:.0f} chars/s)")
    print(f"Decoding time: {decoding_time:.4f}s ({decode_rate:.0f} tokens/s)")
    print()
    print("✅ CharTokenizer tests completed!")
    print("\n💡 Key insights:")
    print(" • Character-level tokenization is simple and transparent")
    print(" • Vocabulary size affects memory usage and unknown token rate")
    print(" • Batch processing enables efficient neural network training")
    print(" • Special tokens handle edge cases gracefully")
    print(" • Round-trip encoding/decoding preserves text (when vocab is sufficient)")
    print(" • 🎉 Ready for integration with TinyGPT!")