mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-02 08:32:31 -05:00
Major Educational Framework Enhancements: • Deploy interactive NBGrader text response questions across ALL modules • Replace passive question lists with active 150-300 word student responses • Enable comprehensive ML Systems learning assessment and grading TinyGPT Integration (Module 16): • Complete TinyGPT implementation showing 70% component reuse from TinyTorch • Demonstrates vision-to-language framework generalization principles • Full transformer architecture with attention, tokenization, and generation • Shakespeare demo showing autoregressive text generation capabilities Module Structure Standardization: • Fix section ordering across all modules: Tests → Questions → Summary • Ensure Module Summary is always the final section for consistency • Standardize comprehensive testing patterns before educational content Interactive Question Implementation: • 3 focused questions per module replacing 10-15 passive questions • NBGrader integration with manual grading workflow for text responses • Questions target ML Systems thinking: scaling, deployment, optimization • Cumulative knowledge building across the 16-module progression Technical Infrastructure: • TPM agent for coordinated multi-agent development workflows • Enhanced documentation with pedagogical design principles • Updated book structure to include TinyGPT as capstone demonstration • Comprehensive QA validation of all module structures Framework Design Insights: • Mathematical unity: Dense layers power both vision and language models • Attention as key innovation for sequential relationship modeling • Production-ready patterns: training loops, optimization, evaluation • System-level thinking: memory, performance, scaling considerations Educational Impact: • Transform passive learning to active engagement through written responses • Enable instructors to assess deep ML Systems understanding • Provide clear progression from foundations to complete language models • Demonstrate real-world framework design 
principles and trade-offs
477 lines
16 KiB
Python
477 lines
16 KiB
Python
"""
|
|
Character-level tokenizer for TinyGPT language models.
|
|
|
|
Implements character-level tokenization for use with TinyGPT transformer models.
|
|
This tokenizer converts text to sequences of character tokens and back.
|
|
"""
|
|
|
|
import json
from collections import Counter
from typing import List, Optional, Dict, Union

import numpy as np
|
|
|
|
|
|
class CharTokenizer:
    """Character-level tokenizer for language models.

    This tokenizer treats each character as a separate token, making it simple
    but effective for learning character-level patterns in text. It's ideal for
    educational purposes and small-scale language modeling experiments.

    The tokenizer builds a vocabulary from the training text and provides
    methods for encoding text to token indices and decoding back to text.

    Educational Benefits:
        - Simple and transparent tokenization strategy
        - No complex subword algorithms to understand
        - Direct character-to-token mapping
        - Easy to debug and visualize
    """

    def __init__(self, vocab_size: Optional[int] = None,
                 special_tokens: Optional[List[str]] = None):
        """Initialize character tokenizer.

        Args:
            vocab_size: Maximum vocabulary size, counting the special tokens
                (None or 0 = unlimited).
            special_tokens: List of special tokens to include
                (e.g., ['<UNK>', '<PAD>']). '<UNK>' and '<PAD>' are appended
                when missing because encode()/decode() depend on them;
                duplicate entries are dropped.

        Educational Note:
            vocab_size limiting is important for computational efficiency.
            Special tokens handle edge cases like unknown characters.
        """
        self.vocab_size = vocab_size

        # Names of the two tokens the tokenizer itself relies on.
        self.unk_token = '<UNK>'
        self.pad_token = '<PAD>'

        # Work on a de-duplicated copy so the caller's list is never aliased,
        # then guarantee the required tokens exist (fit() looks them up).
        tokens = list(dict.fromkeys(special_tokens or []))
        for required in (self.unk_token, self.pad_token):
            if required not in tokens:
                tokens.append(required)
        self.special_tokens = tokens

        # Core vocabulary mappings
        self.char_to_idx: Dict[str, int] = {}
        self.idx_to_char: Dict[int, str] = {}

        # Placeholder indices; the real values are assigned in fit()
        # or load_vocabulary().
        self.unk_idx = 0
        self.pad_idx = 1

        # State tracking
        self.is_fitted = False
        self.character_counts: Dict[str, int] = {}

        print("🔤 CharTokenizer initialized:")
        print(f" Max vocab size: {vocab_size or 'unlimited'}")
        print(f" Special tokens: {self.special_tokens}")

    def fit(self, text: str) -> None:
        """Build vocabulary from training text.

        Args:
            text: Training text to build vocabulary from

        Raises:
            ValueError: If ``text`` is empty.

        Educational Process:
            1. Count character frequencies in the text
            2. Add special tokens first (ensures consistent indices)
            3. Add most frequent characters up to vocab_size limit
            4. Create bidirectional mappings for fast lookup
        """
        if not text:
            raise ValueError("Cannot fit tokenizer on empty text")

        print("🔍 Analyzing text for vocabulary...")
        print(f" Text length: {len(text):,} characters")

        # Count character frequencies
        counts = Counter(text)
        unique_chars = len(counts)
        print(f" Unique characters found: {unique_chars}")

        # Build the mappings locally and publish them only once complete.
        char_to_idx: Dict[str, int] = {}
        idx_to_char: Dict[int, str] = {}

        # Add special tokens first (ensures consistent indices)
        for i, token in enumerate(self.special_tokens):
            char_to_idx[token] = i
            idx_to_char[i] = token

        unk_idx = char_to_idx[self.unk_token]
        pad_idx = char_to_idx[self.pad_token]

        # Most frequent first; ties keep first-seen order.
        sorted_chars = counts.most_common()

        # Add characters to vocabulary up to limit
        next_idx = len(self.special_tokens)
        chars_added = 0
        for char, _count in sorted_chars:
            # Only possible when a special token is a single character.
            if char in char_to_idx:
                continue

            # A falsy vocab_size (None or 0) disables the limit.
            if self.vocab_size and next_idx >= self.vocab_size:
                break

            char_to_idx[char] = next_idx
            idx_to_char[next_idx] = char
            next_idx += 1
            chars_added += 1

        self.character_counts = dict(counts)
        self.char_to_idx = char_to_idx
        self.idx_to_char = idx_to_char
        self.unk_idx = unk_idx
        self.pad_idx = pad_idx
        self.is_fitted = True

        print("✅ Vocabulary built successfully:")
        print(f" Final vocab size: {len(self.char_to_idx)}")
        print(f" Characters included: {chars_added}")
        if self.vocab_size and chars_added < unique_chars:
            excluded = unique_chars - chars_added
            print(f" Characters excluded: {excluded} (will map to <UNK>)")

        # Show most frequent characters
        print(f" Most frequent: {sorted_chars[:10]}")

    def encode(self, text: str) -> List[int]:
        """Convert text to sequence of token indices.

        Args:
            text: Text to encode

        Returns:
            List of token indices (empty for empty text)

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.

        Educational Note:
            Characters not in vocabulary are mapped to <UNK> token.
            This handles rare characters and maintains fixed vocabulary size.
        """
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before encoding")

        if not text:
            return []

        lookup = self.char_to_idx
        unk = self.unk_idx
        indices = [lookup.get(char, unk) for char in text]

        # Counted by vocabulary membership rather than by index value so the
        # figure reflects characters that were genuinely missing.
        unk_count = sum(1 for char in text if char not in lookup)
        if unk_count > 0:
            unk_rate = unk_count / len(text) * 100
            print(f"⚠️ Encoding: {unk_count} unknown chars ({unk_rate:.1f}%)")

        return indices

    def decode(self, indices: List[int]) -> str:
        """Convert sequence of token indices back to text.

        Args:
            indices: Token indices to decode. Any iterable of integers is
                accepted, including a NumPy row produced by encode_batch().

        Returns:
            Decoded text string. <PAD> tokens are dropped; <UNK> is kept in
            the output for debugging.

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.

        Educational Note:
            Invalid indices are skipped to handle generation errors gracefully.
        """
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before decoding")

        # No truthiness test on `indices`: it is ambiguous for NumPy arrays
        # and raises. An empty sequence simply yields "" from the loop below.
        if indices is None:
            return ""

        table = self.idx_to_char
        pad = self.pad_token
        chars = []
        invalid_count = 0

        for idx in indices:
            char = table.get(idx)
            if char is None:
                invalid_count += 1
            elif char != pad:  # Keep <UNK> for debugging
                chars.append(char)

        if invalid_count > 0:
            print(f"⚠️ Decoding: {invalid_count} invalid indices skipped")

        return ''.join(chars)

    def get_vocab_size(self) -> int:
        """Get the current vocabulary size.

        Returns:
            Number of tokens in vocabulary (special tokens included)
        """
        return len(self.char_to_idx)

    def encode_batch(self, texts: List[str], max_length: Optional[int] = None,
                     padding: bool = True, truncation: bool = True) -> np.ndarray:
        """Encode batch of texts with optional padding and truncation.

        Args:
            texts: List of texts to encode
            max_length: Maximum sequence length (None = longest in batch)
            padding: Accepted for API compatibility. The result is a
                rectangular array, so shorter rows are always filled with
                the <PAD> index regardless of this flag.
            truncation: Whether to truncate sequences to max_length. When
                False, the array is widened to the longest sequence so that
                no token is dropped.

        Returns:
            2D int32 numpy array of shape (batch_size, width), where width is
            max_length unless widened because truncation is disabled. An
            empty ``texts`` yields an array with zero rows.

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.

        Educational Benefits:
            - Demonstrates batch processing for efficiency
            - Shows padding/truncation strategies for variable length sequences
            - Prepares data in format expected by neural networks
        """
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before encoding")

        if not texts:
            # Same rank and dtype as a populated batch.
            width = max(max_length or 0, 0)
            return np.empty((0, width), dtype=np.int32)

        # Encode all texts
        encoded_texts = [self.encode(text) for text in texts]
        longest = max(len(encoded) for encoded in encoded_texts)

        # Determine the array width
        if max_length is None:
            max_length = longest
        elif not truncation:
            # Honour truncation=False instead of silently cutting rows.
            max_length = max(max_length, longest)

        # Start from an all-padding array and copy each sequence in.
        batch_array = np.full((len(encoded_texts), max_length),
                              self.pad_idx, dtype=np.int32)
        for row, encoded in enumerate(encoded_texts):
            kept = encoded[:max_length]  # Truncate from the end
            if kept:
                batch_array[row, :len(kept)] = kept

        return batch_array

    def get_vocabulary(self) -> Dict[str, int]:
        """Get the complete vocabulary mapping.

        Returns:
            A copy of the dictionary mapping characters to indices
        """
        return dict(self.char_to_idx)

    def get_special_tokens(self) -> Dict[str, int]:
        """Get special token mappings.

        Returns:
            Dictionary mapping special tokens to indices

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded
                (special tokens have no indices before that).
        """
        if not self.is_fitted:
            raise RuntimeError(
                "Tokenizer must be fitted before reading special tokens")

        return {token: self.char_to_idx[token] for token in self.special_tokens}

    def analyze_text(self, text: str) -> Dict[str, Union[int, float]]:
        """Analyze text with current vocabulary.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with analysis statistics: 'length', 'tokens',
            'unique_chars', 'vocab_coverage' (number of distinct characters
            present in the vocabulary), 'unk_count', 'unk_rate' (percent) and
            'compression_ratio'. For empty text every statistic is zero and
            the legacy 'coverage' key is included as well.

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.

        Educational Purpose:
            Helps understand vocabulary coverage and tokenization quality.
        """
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before analysis")

        if not text:
            return {
                'length': 0,
                'tokens': 0,
                'unique_chars': 0,
                'vocab_coverage': 0,
                'unk_count': 0,
                'unk_rate': 0.0,
                'compression_ratio': 0.0,
                # Legacy key from the earlier empty-text result.
                'coverage': 0.0,
            }

        indices = self.encode(text)
        unk_count = indices.count(self.unk_idx)
        distinct = set(text)
        vocab = self.char_to_idx

        return {
            'length': len(text),
            'tokens': len(indices),
            'unique_chars': len(distinct),
            'vocab_coverage': sum(1 for char in distinct if char in vocab),
            'unk_count': unk_count,
            'unk_rate': unk_count / len(indices) * 100 if indices else 0.0,
            'compression_ratio': len(text) / len(indices) if indices else 0.0,
        }

    def save_vocabulary(self, filepath: str) -> None:
        """Save vocabulary to file for reuse.

        Args:
            filepath: Path to save vocabulary file (written as UTF-8 JSON)

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.

        Educational Note:
            In production, you'd want to save/load vocabularies to ensure
            consistency across training and inference.
        """
        if not self.is_fitted:
            raise RuntimeError("Cannot save unfitted tokenizer")

        vocab_data = {
            'char_to_idx': self.char_to_idx,
            'special_tokens': self.special_tokens,
            'vocab_size': self.vocab_size,
            'character_counts': self.character_counts,
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(vocab_data, f, ensure_ascii=False, indent=2)

        print(f"💾 Vocabulary saved to {filepath}")

    def load_vocabulary(self, filepath: str) -> None:
        """Load vocabulary from file.

        Args:
            filepath: Path to vocabulary file written by save_vocabulary()

        Raises:
            KeyError: If the file lacks a required field or does not define
                the <UNK>/<PAD> tokens. The tokenizer is left unchanged in
                that case.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)

        # Read and validate everything before touching self, so a malformed
        # file cannot leave the tokenizer half-loaded.
        char_to_idx = vocab_data['char_to_idx']
        special_tokens = vocab_data['special_tokens']
        vocab_size = vocab_data['vocab_size']
        character_counts = vocab_data['character_counts']
        unk_idx = char_to_idx[self.unk_token]
        pad_idx = char_to_idx[self.pad_token]

        # Rebuild reverse mapping
        idx_to_char = {int(idx): char for char, idx in char_to_idx.items()}

        self.char_to_idx = char_to_idx
        self.idx_to_char = idx_to_char
        self.special_tokens = special_tokens
        self.vocab_size = vocab_size
        self.character_counts = character_counts
        self.unk_idx = unk_idx
        self.pad_idx = pad_idx
        self.is_fitted = True

        print(f"📁 Vocabulary loaded from {filepath}")
        print(f" Vocab size: {len(self.char_to_idx)}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo / smoke test for CharTokenizer: fits on a short sample, then
    # exercises encode/decode, batching, analysis, a tiny vocabulary and a
    # rough throughput measurement. Output is printed, nothing is asserted.
    print("🧪 Testing CharTokenizer")
    print("=" * 50)

    # Sample text for testing
    sample_text = """To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And by opposing end them."""

    print(f"📝 Sample text ({len(sample_text)} chars):")
    print(f"'{sample_text[:100]}...'")
    print()

    # Test basic tokenization
    print("🔤 Basic Tokenization Test:")
    tokenizer = CharTokenizer(vocab_size=50)
    tokenizer.fit(sample_text)
    print()

    # Test encoding/decoding
    test_phrase = "To be or not to be"
    print("🔬 Encoding/Decoding Test:")
    print(f"Original: '{test_phrase}'")

    encoded = tokenizer.encode(test_phrase)
    print(f"Encoded: {encoded}")

    decoded = tokenizer.decode(encoded)
    print(f"Decoded: '{decoded}'")

    print(f"Round-trip successful: {test_phrase == decoded}")
    print()

    # Test batch encoding
    print("📦 Batch Encoding Test:")
    batch_texts = [
        "To be",
        "or not to be",
        "that is the question",
    ]

    batch_encoded = tokenizer.encode_batch(batch_texts, max_length=20)
    print(f"Batch shape: {batch_encoded.shape}")
    print(f"Batch sample:\n{batch_encoded}")
    print()

    # Test vocabulary analysis
    print("📊 Vocabulary Analysis:")
    vocab = tokenizer.get_vocabulary()
    special_tokens = tokenizer.get_special_tokens()

    print(f"Total vocabulary size: {len(vocab)}")
    print(f"Special tokens: {special_tokens}")
    print(f"Sample characters: {list(vocab.keys())[:20]}")
    print()

    # Test text analysis
    print("🔍 Text Analysis:")
    stats = tokenizer.analyze_text(sample_text)
    for key, value in stats.items():
        if isinstance(value, float):
            print(f" {key}: {value:.2f}")
        else:
            print(f" {key}: {value}")
    print()

    # Test with limited vocabulary
    print("⚠️ Limited Vocabulary Test:")
    small_tokenizer = CharTokenizer(vocab_size=10)  # Very small vocab
    small_tokenizer.fit("abcdefghijklmnopqrstuvwxyz")

    test_text = "Hello, World!"
    encoded_small = small_tokenizer.encode(test_text)
    decoded_small = small_tokenizer.decode(encoded_small)

    print(f"Original: '{test_text}'")
    print(f"Decoded: '{decoded_small}'")
    print(f"Small vocab size: {small_tokenizer.get_vocab_size()}")
    print()

    # Performance characteristics
    print("⚡ Performance Characteristics:")
    import time

    # perf_counter is a monotonic high-resolution clock; time.time() can
    # report a zero delta for work this short on coarse-grained clocks.
    long_text = sample_text * 100  # Make it longer

    # Encoding speed test
    start_time = time.perf_counter()
    encoded_long = tokenizer.encode(long_text)
    encoding_time = time.perf_counter() - start_time

    # Decoding speed test
    start_time = time.perf_counter()
    decoded_long = tokenizer.decode(encoded_long)
    decoding_time = time.perf_counter() - start_time

    # Guard the division so a zero elapsed time reports "inf" rather than
    # raising ZeroDivisionError.
    encoding_rate = len(long_text) / encoding_time if encoding_time > 0 else float('inf')
    decoding_rate = len(encoded_long) / decoding_time if decoding_time > 0 else float('inf')

    print(f"Text length: {len(long_text):,} chars")
    print(f"Encoding time: {encoding_time:.4f}s ({encoding_rate:.0f} chars/s)")
    print(f"Decoding time: {decoding_time:.4f}s ({decoding_rate:.0f} tokens/s)")
    print()

    print("✅ CharTokenizer tests completed!")
    print("\n💡 Key insights:")
    print(" • Character-level tokenization is simple and transparent")
    print(" • Vocabulary size affects memory usage and unknown token rate")
    print(" • Batch processing enables efficient neural network training")
    print(" • Special tokens handle edge cases gracefully")
    print(" • Round-trip encoding/decoding preserves text (when vocab is sufficient)")
    print(" • 🎉 Ready for integration with TinyGPT!")