diff --git a/milestones/05_2017_transformer/download_tinystories.py b/milestones/05_2017_transformer/download_tinystories.py deleted file mode 100755 index cf177197..00000000 --- a/milestones/05_2017_transformer/download_tinystories.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python3 -""" -Download and prepare TinyStories dataset for TinyTorch training. - -TinyStories is a dataset of simple, synthetic stories designed for -training small language models. It's much easier than Shakespeare! -""" - -import os -import urllib.request - -def download_tinystories(): - """Download TinyStories dataset.""" - - # Create data directory - data_dir = os.path.join(os.path.dirname(__file__), '../datasets/tinystories') - os.makedirs(data_dir, exist_ok=True) - - # TinyStories validation set (smaller, good for testing) - urls = { - 'tiny_val': 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-valid.txt', - 'tiny_train_small': 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories-train.txt' - } - - print("πŸ“₯ Downloading TinyStories dataset...") - print("="*70) - - # Start with validation set (much smaller for testing) - filename = 'tinystories_val.txt' - filepath = os.path.join(data_dir, filename) - - if os.path.exists(filepath): - print(f"βœ… {filename} already exists") - size = os.path.getsize(filepath) / (1024 * 1024) - print(f" Size: {size:.2f} MB") - else: - print(f"⬇️ Downloading {filename}...") - try: - urllib.request.urlretrieve(urls['tiny_val'], filepath) - size = os.path.getsize(filepath) / (1024 * 1024) - print(f"βœ… Downloaded! 
Size: {size:.2f} MB") - except Exception as e: - print(f"❌ Error downloading: {e}") - print("\nπŸ’‘ Alternative: Download manually from:") - print(f" {urls['tiny_val']}") - print(f" Save to: {filepath}") - return None - - # Read and show sample - with open(filepath, 'r', encoding='utf-8') as f: - text = f.read() - - print(f"\nπŸ“Š Dataset Stats:") - print(f" Total characters: {len(text):,}") - print(f" Total words: {len(text.split()):,}") - print(f" Unique characters: {len(set(text))}") - - # Show first story - stories = text.split('<|endoftext|>') - if len(stories) > 0: - first_story = stories[0].strip() - print(f"\nπŸ“– Sample Story:") - print(" " + "-"*66) - print(" " + first_story[:300].replace('\n', '\n ')) - if len(first_story) > 300: - print(" ...") - print(" " + "-"*66) - - print(f"\nβœ… TinyStories ready for training!") - print(f" Location: {filepath}") - - return filepath - -if __name__ == '__main__': - download_tinystories() diff --git a/milestones/05_2017_transformer/level1_memorization.py b/milestones/05_2017_transformer/level1_memorization.py deleted file mode 100644 index 9434c866..00000000 --- a/milestones/05_2017_transformer/level1_memorization.py +++ /dev/null @@ -1,338 +0,0 @@ -""" -Milestone 05 - Level 1: Transformer Memorization Test -====================================================== - -SIMPLEST POSSIBLE TRANSFORMER TEST: -Can the transformer memorize and reproduce simple sequences? 
- -Task: Given "ABCD", predict "BCDE" - Given "1234", predict "2345" - -Expected: -- Train in < 2 minutes -- Loss should drop from ~3.0 to < 0.1 -- Should perfectly predict next character - -This validates: -βœ“ Transformer architecture works -βœ“ Attention mechanism works -βœ“ Gradient flow works -βœ“ Training loop works -""" - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - -import numpy as np -import time -from tinytorch.core.tensor import Tensor -from tinytorch.core.autograd import enable_autograd -from tinytorch.core.optimizers import Adam -from tinytorch.core.losses import CrossEntropyLoss -from tinytorch.models.transformer import GPT - -enable_autograd() - -# ============================================================================ -# Level 1: Simple Memorization Dataset -# ============================================================================ - -def create_memorization_dataset(): - """ - Create ultra-simple sequences to memorize: - - Alphabet sequences: ABCD, EFGH, etc. - - Number sequences: 1234, 5678, etc. - - Pattern sequences: AAAA, BBBB, etc. 
- """ - sequences = [ - # Alphabet - "ABCDE", - "FGHIJ", - "KLMNO", - "PQRST", - "UVWXY", - # Numbers - "12345", - "67890", - # Patterns - "AAAAA", - "BBBBB", - "CCCCC", - # Mixed - "A1B2C", - "X9Y8Z", - ] - return sequences - - -def create_simple_tokenizer(sequences): - """Create character-level tokenizer for sequences.""" - # Get all unique characters - all_chars = sorted(set(''.join(sequences))) - - # Create mappings (0 is reserved for padding) - char_to_idx = {char: idx + 1 for idx, char in enumerate(all_chars)} - idx_to_char = {idx + 1: char for idx, char in enumerate(all_chars)} - char_to_idx[''] = 0 - idx_to_char[0] = '' - - return char_to_idx, idx_to_char - - -def encode_sequence(seq, char_to_idx, max_len=8): - """Encode sequence to token IDs.""" - tokens = [char_to_idx.get(c, 0) for c in seq] - # Pad to max_len - if len(tokens) < max_len: - tokens = tokens + [0] * (max_len - len(tokens)) - else: - tokens = tokens[:max_len] - return tokens - - -def decode_sequence(tokens, idx_to_char): - """Decode token IDs to string.""" - chars = [idx_to_char.get(t, '') for t in tokens if t != 0] - return ''.join(chars) - - -# ============================================================================ -# Training -# ============================================================================ - -def train_memorization(model, optimizer, loss_fn, train_data, vocab_size, max_steps=200): - """ - Train transformer to memorize sequences. 
- Target: < 2 minutes, loss < 0.1 - """ - print("=" * 70) - print("TRAINING LEVEL 1: MEMORIZATION") - print("=" * 70) - print(f"Dataset: {len(train_data)} sequences") - print(f"Vocab size: {vocab_size}") - print(f"Max steps: {max_steps}") - print(f"Target: Loss < 0.1 in < 2 minutes") - print() - - start_time = time.time() - losses = [] - - for step in range(max_steps): - # Sample random sequence - tokens = train_data[np.random.randint(len(train_data))] - - # Input: all but last token - # Target: all but first token (next token prediction) - input_seq = tokens[:-1] - target_seq = tokens[1:] - - # Convert to tensors - x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False) - y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False) - - # Forward pass - logits = model.forward(x) - - # Compute loss - batch_size, seq_len, vocab_size_out = logits.shape - logits_flat = logits.reshape(batch_size * seq_len, vocab_size_out) - targets_flat = y_true.reshape(batch_size * seq_len) - loss = loss_fn.forward(logits_flat, targets_flat) - - # Backward pass - optimizer.zero_grad() - loss.backward() - - # Clip gradients - for param in model.parameters(): - if param.grad is not None: - np.clip(param.grad, -1.0, 1.0, out=param.grad) - - # Update - optimizer.step() - - losses.append(loss.data.item()) - - # Progress every 50 steps - if step % 50 == 0: - avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - elapsed = time.time() - start_time - print(f"Step {step:4d}/{max_steps} | Loss: {avg_loss:.4f} | Time: {elapsed:.1f}s") - - # Early stopping - if avg_loss < 0.2: - print(f"\nβœ“ Target reached! 
Loss < 0.2 at step {step}") - break - - elapsed = time.time() - start_time - final_loss = np.mean(losses[-100:]) - initial_loss = np.mean(losses[:10]) - improvement = (1 - final_loss / initial_loss) * 100 - - print() - print("=" * 70) - print("TRAINING COMPLETE") - print("=" * 70) - print(f"Time: {elapsed:.1f} seconds") - print(f"Initial loss: {initial_loss:.4f}") - print(f"Final loss: {final_loss:.4f}") - print(f"Improvement: {improvement:.1f}%") - print() - - return losses - - -# ============================================================================ -# Testing -# ============================================================================ - -def test_memorization(model, test_sequences, char_to_idx, idx_to_char): - """ - Test if model can reproduce memorized sequences. - """ - print("=" * 70) - print("TESTING LEVEL 1: MEMORIZATION") - print("=" * 70) - print() - - correct = 0 - total = len(test_sequences) - - for seq in test_sequences: - # Encode - tokens = encode_sequence(seq, char_to_idx, max_len=8) - - # Get model predictions - x = Tensor(np.array([tokens[:-1]], dtype=np.int32), requires_grad=False) - logits = model.forward(x) - - # Decode predictions (greedy) - predicted_tokens = [] - for i in range(logits.shape[1]): - next_token = int(np.argmax(logits.data[0, i, :])) - predicted_tokens.append(next_token) - - # Compare - expected = tokens[1:] # Target sequence - predicted = predicted_tokens - - # Check if match (ignoring padding) - match = True - for exp, pred in zip(expected, predicted): - if exp == 0: # Padding, stop checking - break - if exp != pred: - match = False - break - - if match: - correct += 1 - status = "βœ“" - else: - status = "βœ—" - - # Decode for display - expected_str = decode_sequence(expected, idx_to_char) - predicted_str = decode_sequence(predicted, idx_to_char) - - print(f"{status} Input: {seq[:4]:8s} β†’ Expected: {expected_str:8s} | Got: {predicted_str:8s}") - - accuracy = (correct / total) * 100 - print() - print(f"Accuracy: 
{correct}/{total} ({accuracy:.1f}%)") - print() - - if accuracy >= 90: - print("βœ“ LEVEL 1 PASSED: Transformer can memorize sequences!") - else: - print("βœ— LEVEL 1 FAILED: Needs more training or debugging") - - return accuracy - - -# ============================================================================ -# Main -# ============================================================================ - -def main(): - print() - print("=" * 70) - print("MILESTONE 05 - LEVEL 1: TRANSFORMER MEMORIZATION TEST") - print("=" * 70) - print() - print("Goal: Train transformer to memorize simple sequences in < 2 minutes") - print() - - # Create dataset - sequences = create_memorization_dataset() - char_to_idx, idx_to_char = create_simple_tokenizer(sequences) - vocab_size = len(idx_to_char) - - print(f"Dataset: {len(sequences)} sequences") - print(f"Vocabulary: {vocab_size} tokens") - print(f"Example: {sequences[0]} β†’ {encode_sequence(sequences[0], char_to_idx)}") - print() - - # Encode all sequences - train_data = [encode_sequence(seq, char_to_idx, max_len=8) for seq in sequences] - - # Create ULTRA-tiny model for speed - config = { - 'vocab_size': vocab_size, - 'embed_dim': 16, # Super tiny! 
- 'num_layers': 1, # Just 1 layer - 'num_heads': 2, # 2 heads - 'max_seq_len': 8, # Short sequences - } - - print("Model configuration:") - for key, val in config.items(): - print(f" {key}: {val}") - print() - - model = GPT(**config) - num_params = sum(np.prod(p.shape) for p in model.parameters()) - print(f"Parameters: {num_params:,}") - print() - - # Optimizer and loss - optimizer = Adam(model.parameters(), lr=0.001) - loss_fn = CrossEntropyLoss() - - # Train - print("Starting training...") - print() - losses = train_memorization( - model=model, - optimizer=optimizer, - loss_fn=loss_fn, - train_data=train_data, - vocab_size=vocab_size, - max_steps=200 # Reduced for speed (ultra-tiny model) - ) - - # Test - print("Starting testing...") - print() - accuracy = test_memorization(model, sequences, char_to_idx, idx_to_char) - - # Summary - print("=" * 70) - print("LEVEL 1 SUMMARY") - print("=" * 70) - print(f"βœ“ Training: {len(losses)} steps") - print(f"βœ“ Loss: {np.mean(losses[:10]):.4f} β†’ {np.mean(losses[-100:]):.4f}") - print(f"βœ“ Accuracy: {accuracy:.1f}%") - print() - - if accuracy >= 90: - print("πŸŽ‰ LEVEL 1 COMPLETE! Ready for Level 2: Pattern Completion") - else: - print("⚠️ LEVEL 1 INCOMPLETE: Needs debugging") - print() - - -if __name__ == "__main__": - main() - diff --git a/milestones/05_2017_transformer/level2_patterns.py b/milestones/05_2017_transformer/level2_patterns.py deleted file mode 100644 index e7fce222..00000000 --- a/milestones/05_2017_transformer/level2_patterns.py +++ /dev/null @@ -1,357 +0,0 @@ -""" -Milestone 05 - Level 2: Transformer Pattern Completion -======================================================= - -SIMPLE PATTERN COMPLETION TEST: -Can the transformer learn to complete simple patterns? 
- -Task: Given "A B C", predict "D" - Given "1 2 3", predict "4" - Given "do re mi", predict "fa" - -Expected: -- Train in < 5 minutes -- Loss should drop from ~3.0 to < 0.5 -- Should complete 70%+ of patterns correctly - -This validates: -βœ“ Transformer can learn relationships -βœ“ Attention mechanism captures patterns -βœ“ Model generalizes beyond memorization -""" - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - -import numpy as np -import time -from tinytorch.core.tensor import Tensor -from tinytorch.core.autograd import enable_autograd -from tinytorch.core.optimizers import Adam -from tinytorch.core.losses import CrossEntropyLoss -from tinytorch.models.transformer import GPT - -enable_autograd() - -# ============================================================================ -# Level 2: Pattern Completion Dataset -# ============================================================================ - -def create_pattern_dataset(): - """ - Create simple completion patterns: - - Sequences: A B C β†’ D - - Counting: 1 2 3 β†’ 4 - - Musical: do re mi β†’ fa - """ - patterns = [ - # Alphabet sequences - ("A B C", "D"), - ("D E F", "G"), - ("M N O", "P"), - ("W X Y", "Z"), - # Numbers - ("1 2 3", "4"), - ("5 6 7", "8"), - # Words (short) - ("cat dog", "rat"), - ("up down", "left"), - # Repetition - ("A A A", "A"), - ("B B B", "B"), - ("1 1 1", "1"), - ] - return patterns - - -def create_tokenizer(patterns): - """Create character-level tokenizer.""" - # Get all unique characters - all_text = ' '.join([p[0] + ' ' + p[1] for p in patterns]) - all_chars = sorted(set(all_text)) - - # Create mappings (0 = padding, 1 = EOS) - char_to_idx = {char: idx + 2 for idx, char in enumerate(all_chars)} - idx_to_char = {idx + 2: char for idx, char in enumerate(all_chars)} - char_to_idx[''] = 0 - char_to_idx[''] = 1 - idx_to_char[0] = '' - idx_to_char[1] = '' - - return char_to_idx, idx_to_char - - -def encode_pattern(input_str, 
target_str, char_to_idx, max_len=16): - """Encode pattern as: input + + target + , then pad.""" - # Encode input - input_tokens = [char_to_idx.get(c, 0) for c in input_str] - input_tokens.append(1) # EOS - - # Encode target - target_tokens = [char_to_idx.get(c, 0) for c in target_str] - target_tokens.append(1) # EOS - - # Combine - tokens = input_tokens + target_tokens - - # Pad - if len(tokens) < max_len: - tokens = tokens + [0] * (max_len - len(tokens)) - else: - tokens = tokens[:max_len] - - return tokens - - -def decode_tokens(tokens, idx_to_char): - """Decode tokens to string.""" - chars = [] - for t in tokens: - if t == 0: # padding - break - if t == 1: # EOS - break - chars.append(idx_to_char.get(t, '?')) - return ''.join(chars) - - -# ============================================================================ -# Training -# ============================================================================ - -def train_patterns(model, optimizer, loss_fn, train_data, vocab_size, max_steps=400): - """ - Train transformer to complete patterns. 
- Target: < 5 minutes, loss < 0.5 - """ - print("=" * 70) - print("TRAINING LEVEL 2: PATTERN COMPLETION") - print("=" * 70) - print(f"Dataset: {len(train_data)} patterns") - print(f"Vocab size: {vocab_size}") - print(f"Max steps: {max_steps}") - print(f"Target: Loss < 0.5 in < 5 minutes") - print() - - start_time = time.time() - losses = [] - - for step in range(max_steps): - # Sample random pattern - tokens = train_data[np.random.randint(len(train_data))] - - # Input: all but last - # Target: all but first (shifted by 1) - input_seq = tokens[:-1] - target_seq = tokens[1:] - - # Convert to tensors - x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False) - y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False) - - # Forward pass - logits = model.forward(x) - - # Compute loss - batch_size, seq_len, vocab_size_out = logits.shape - logits_flat = logits.reshape(batch_size * seq_len, vocab_size_out) - targets_flat = y_true.reshape(batch_size * seq_len) - loss = loss_fn.forward(logits_flat, targets_flat) - - # Backward pass - optimizer.zero_grad() - loss.backward() - - # Clip gradients - for param in model.parameters(): - if param.grad is not None: - np.clip(param.grad, -1.0, 1.0, out=param.grad) - - # Update - optimizer.step() - - losses.append(loss.data.item()) - - # Progress every 50 steps - if step % 50 == 0 or step == max_steps - 1: - avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - elapsed = time.time() - start_time - print(f"Step {step:4d}/{max_steps} | Loss: {avg_loss:.4f} | Time: {elapsed:.1f}s") - - # Early stopping - if avg_loss < 0.5: - print(f"\nβœ“ Target reached! 
Loss < 0.5 at step {step}") - break - - elapsed = time.time() - start_time - final_loss = np.mean(losses[-100:]) - initial_loss = np.mean(losses[:10]) - improvement = (1 - final_loss / initial_loss) * 100 - - print() - print("=" * 70) - print("TRAINING COMPLETE") - print("=" * 70) - print(f"Time: {elapsed:.1f} seconds") - print(f"Initial loss: {initial_loss:.4f}") - print(f"Final loss: {final_loss:.4f}") - print(f"Improvement: {improvement:.1f}%") - print() - - return losses - - -# ============================================================================ -# Testing -# ============================================================================ - -def test_patterns(model, test_patterns, char_to_idx, idx_to_char, max_len=16): - """ - Test if model can complete patterns. - """ - print("=" * 70) - print("TESTING LEVEL 2: PATTERN COMPLETION") - print("=" * 70) - print() - - correct = 0 - total = len(test_patterns) - - for input_str, expected_target in test_patterns: - # Encode input + EOS - input_tokens = [char_to_idx.get(c, 0) for c in input_str] - input_tokens.append(1) # EOS - - # Pad to max_len-1 (leave room for generation) - while len(input_tokens) < max_len - 1: - input_tokens.append(0) - input_tokens = input_tokens[:max_len-1] - - # Forward pass - x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False) - logits = model.forward(x) - - # Get prediction for next token (after input + EOS) - input_len = len([c for c in input_str]) + 1 # +1 for EOS - if input_len < len(input_tokens): - next_token_logits = logits.data[0, input_len - 1, :] # Predict position after EOS - predicted_token = int(np.argmax(next_token_logits)) - - # Decode - predicted_char = idx_to_char.get(predicted_token, '?') - - # Check if correct (compare first character of target) - expected_first_char = expected_target[0] if len(expected_target) > 0 else '' - match = (predicted_char == expected_first_char) - else: - match = False - predicted_char = '?' 
- - if match: - correct += 1 - status = "βœ“" - else: - status = "βœ—" - - print(f"{status} Input: \"{input_str:12s}\" β†’ Expected: \"{expected_target:6s}\" | Got: \"{predicted_char}\"") - - accuracy = (correct / total) * 100 - print() - print(f"Accuracy: {correct}/{total} ({accuracy:.1f}%)") - print() - - if accuracy >= 70: - print("βœ“ LEVEL 2 PASSED: Transformer can complete patterns!") - else: - print("βœ— LEVEL 2 FAILED: Needs more training") - - return accuracy - - -# ============================================================================ -# Main -# ============================================================================ - -def main(): - print() - print("=" * 70) - print("MILESTONE 05 - LEVEL 2: TRANSFORMER PATTERN COMPLETION") - print("=" * 70) - print() - print("Goal: Train transformer to complete patterns in < 5 minutes") - print() - - # Create dataset - patterns = create_pattern_dataset() - char_to_idx, idx_to_char = create_tokenizer(patterns) - vocab_size = len(idx_to_char) - - print(f"Dataset: {len(patterns)} patterns") - print(f"Vocabulary: {vocab_size} tokens") - print(f"Example: \"{patterns[0][0]}\" β†’ \"{patterns[0][1]}\"") - print() - - # Encode all patterns - max_len = 16 - train_data = [encode_pattern(inp, out, char_to_idx, max_len) for inp, out in patterns] - - # Create small model (bigger than Level 1) - config = { - 'vocab_size': vocab_size, - 'embed_dim': 24, # Slightly bigger - 'num_layers': 2, # 2 layers - 'num_heads': 2, # 2 heads - 'max_seq_len': max_len, - } - - print("Model configuration:") - for key, val in config.items(): - print(f" {key}: {val}") - print() - - model = GPT(**config) - num_params = sum(np.prod(p.shape) for p in model.parameters()) - print(f"Parameters: {num_params:,}") - print() - - # Optimizer and loss - optimizer = Adam(model.parameters(), lr=0.001) - loss_fn = CrossEntropyLoss() - - # Train - print("Starting training...") - print() - losses = train_patterns( - model=model, - optimizer=optimizer, - 
loss_fn=loss_fn, - train_data=train_data, - vocab_size=vocab_size, - max_steps=400 - ) - - # Test - print("Starting testing...") - print() - accuracy = test_patterns(model, patterns, char_to_idx, idx_to_char, max_len) - - # Summary - print("=" * 70) - print("LEVEL 2 SUMMARY") - print("=" * 70) - print(f"βœ“ Training: {len(losses)} steps") - print(f"βœ“ Loss: {np.mean(losses[:10]):.4f} β†’ {np.mean(losses[-100:]):.4f}") - print(f"βœ“ Accuracy: {accuracy:.1f}%") - print() - - if accuracy >= 70: - print("πŸŽ‰ LEVEL 2 COMPLETE! Ready for Level 3: Text Generation") - else: - print("⚠️ LEVEL 2 INCOMPLETE: Needs more training") - print() - - -if __name__ == "__main__": - main() - diff --git a/milestones/05_2017_transformer/simple_gpt.py b/milestones/05_2017_transformer/simple_gpt.py deleted file mode 100644 index 48b4f638..00000000 --- a/milestones/05_2017_transformer/simple_gpt.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -Simple GPT model for CodeBot milestone - bypasses LayerNorm gradient bug. - -This is a workaround for the milestone until core Tensor operations -(subtraction, mean) are fixed to maintain gradient flow. -""" - -import numpy as np -from tinytorch.core.tensor import Tensor -from tinytorch.core.layers import Linear -from tinytorch.core.attention import MultiHeadAttention -from tinytorch.core.activations import GELU -from tinytorch.text.embeddings import Embedding - - -class SimpleGPT: - """ - Simplified GPT without LayerNorm (workaround for gradient flow bugs). - - Architecture: - - Token + Position embeddings - - N transformer blocks (attention + MLP, NO LayerNorm) - - Output projection to vocabulary - - Note: This is a temporary solution for the milestone. The full GPT - with LayerNorm requires fixes to core Tensor subtraction/mean operations. 
- """ - - def __init__( - self, - vocab_size: int, - embed_dim: int, - num_layers: int, - num_heads: int, - max_seq_len: int, - mlp_ratio: int = 4 - ): - self.vocab_size = vocab_size - self.embed_dim = embed_dim - self.num_layers = num_layers - self.num_heads = num_heads - self.max_seq_len = max_seq_len - - # Embeddings - self.token_embedding = Embedding(vocab_size, embed_dim) - self.position_embedding = Embedding(max_seq_len, embed_dim) - - # Transformer blocks (simplified - no LayerNorm) - self.blocks = [] - for _ in range(num_layers): - block = { - 'attention': MultiHeadAttention(embed_dim, num_heads), - 'mlp_fc1': Linear(embed_dim, embed_dim * mlp_ratio), - 'mlp_gelu': GELU(), # Use tinytorch's GELU - 'mlp_fc2': Linear(embed_dim * mlp_ratio, embed_dim), - } - self.blocks.append(block) - - # Output projection - self.lm_head = Linear(embed_dim, vocab_size) - - def forward(self, tokens: Tensor) -> Tensor: - """ - Forward pass through simplified GPT. - - Args: - tokens: Token indices, shape (batch_size, seq_len) - - Returns: - logits: Predictions, shape (batch_size, seq_len, vocab_size) - """ - batch_size, seq_len = tokens.shape - - # Embeddings - token_emb = self.token_embedding.forward(tokens) - positions = Tensor(np.arange(seq_len).reshape(1, seq_len)) - pos_emb = self.position_embedding.forward(positions) - x = token_emb + pos_emb # (batch, seq, embed) - - # Transformer blocks - for block in self.blocks: - # Self-attention with residual - attn_out = block['attention'].forward(x) - x = x + attn_out # Residual connection - - # MLP with residual - mlp_out = block['mlp_fc1'].forward(x) - mlp_out = block['mlp_gelu'].forward(mlp_out) # Activation - mlp_out = block['mlp_fc2'].forward(mlp_out) - x = x + mlp_out # Residual connection - - # Project to vocabulary - logits = self.lm_head.forward(x) - return logits - - def parameters(self): - """Return all trainable parameters.""" - params = [] - params.extend(self.token_embedding.parameters()) - 
params.extend(self.position_embedding.parameters()) - - for block in self.blocks: - params.extend(block['attention'].parameters()) - params.extend(block['mlp_fc1'].parameters()) - params.extend(block['mlp_fc2'].parameters()) - - params.extend(self.lm_head.parameters()) - return params - diff --git a/milestones/05_2017_transformer/test_5min_training.py b/milestones/05_2017_transformer/test_5min_training.py deleted file mode 100644 index 45ff9cc1..00000000 --- a/milestones/05_2017_transformer/test_5min_training.py +++ /dev/null @@ -1,316 +0,0 @@ -""" -Milestone 05 - 5-Minute Training Test -====================================== - -GOAL: Train the best possible transformer in exactly 5 minutes. - -We'll optimize for: -- Maximum learning in 5 minutes -- Clear progress visualization -- Actual generation testing -- Student-friendly output - -This will show what's realistically achievable in a classroom demo. -""" - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - -import numpy as np -import time -from tinytorch.core.tensor import Tensor -from tinytorch.core.autograd import enable_autograd -from tinytorch.core.optimizers import Adam -from tinytorch.core.losses import CrossEntropyLoss -from tinytorch.models.transformer import GPT - -enable_autograd() - -# ============================================================================ -# Dataset: Mix of memorization + patterns -# ============================================================================ - -def create_dataset(): - """Create a diverse but simple dataset.""" - sequences = [ - # Easy memorization - "AAAA", "BBBB", "CCCC", "1111", "2222", - # Simple sequences - "ABCD", "EFGH", "IJKL", "MNOP", "QRST", - "1234", "5678", "9012", - # Patterns (with repetition for learning) - "AB", "CD", "EF", "GH", - "12", "34", "56", "78", - ] * 3 # Triple the dataset for better learning - return sequences - - -def create_tokenizer(sequences): - """Simple character tokenizer.""" - 
all_chars = sorted(set(''.join(sequences))) - char_to_idx = {char: idx + 1 for idx, char in enumerate(all_chars)} - idx_to_char = {idx + 1: char for idx, char in enumerate(all_chars)} - char_to_idx[''] = 0 - idx_to_char[0] = '' - return char_to_idx, idx_to_char - - -def encode(seq, char_to_idx, max_len=10): - """Encode and pad sequence.""" - tokens = [char_to_idx.get(c, 0) for c in seq] - if len(tokens) < max_len: - tokens = tokens + [0] * (max_len - len(tokens)) - else: - tokens = tokens[:max_len] - return tokens - - -def decode(tokens, idx_to_char): - """Decode tokens to string.""" - return ''.join([idx_to_char.get(t, '') for t in tokens if t != 0]) - - -# ============================================================================ -# Training with 5-minute time limit -# ============================================================================ - -def train_5_minutes(model, optimizer, loss_fn, train_data, max_time_seconds=300): - """ - Train for exactly 5 minutes, show progress throughout. 
- """ - print("=" * 70) - print("TRAINING FOR 5 MINUTES") - print("=" * 70) - print(f"Dataset: {len(train_data)} sequences") - print(f"Time limit: {max_time_seconds}s ({max_time_seconds/60:.1f} minutes)") - print() - - start_time = time.time() - losses = [] - step = 0 - - # Progress checkpoints at 1, 2, 3, 4, 5 minutes - checkpoints = [60, 120, 180, 240, 300] - checkpoint_idx = 0 - - print("Training started...") - print() - - while True: - # Check time limit - elapsed = time.time() - start_time - if elapsed >= max_time_seconds: - break - - # Sample random sequence - tokens = train_data[np.random.randint(len(train_data))] - - # Next token prediction - input_seq = tokens[:-1] - target_seq = tokens[1:] - - x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False) - y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False) - - # Forward - logits = model.forward(x) - - # Loss - batch_size, seq_len, vocab_size = logits.shape - logits_flat = logits.reshape(batch_size * seq_len, vocab_size) - targets_flat = y_true.reshape(batch_size * seq_len) - loss = loss_fn.forward(logits_flat, targets_flat) - - # Backward - optimizer.zero_grad() - loss.backward() - - # Clip gradients - for param in model.parameters(): - if param.grad is not None: - np.clip(param.grad, -1.0, 1.0, out=param.grad) - - # Update - optimizer.step() - - losses.append(loss.data.item()) - step += 1 - - # Show progress at checkpoints - if checkpoint_idx < len(checkpoints) and elapsed >= checkpoints[checkpoint_idx]: - avg_loss = np.mean(losses[-50:]) if len(losses) >= 50 else np.mean(losses) - steps_per_sec = step / elapsed - print(f"[{int(elapsed):3d}s] Step {step:4d} | Loss: {avg_loss:.4f} | Speed: {steps_per_sec:.2f} steps/sec") - checkpoint_idx += 1 - - # Also show every 50 steps if we're going fast - if step % 50 == 0: - if checkpoint_idx == 0 or elapsed < checkpoints[0]: # Only if we haven't hit first checkpoint - avg_loss = np.mean(losses[-50:]) if len(losses) >= 50 else 
np.mean(losses) - print(f"[{int(elapsed):3d}s] Step {step:4d} | Loss: {avg_loss:.4f}") - - final_elapsed = time.time() - start_time - final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - initial_loss = np.mean(losses[:10]) - improvement = (1 - final_loss / initial_loss) * 100 - - print() - print("=" * 70) - print("TRAINING COMPLETE") - print("=" * 70) - print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.2f} minutes)") - print(f"Total steps: {step}") - print(f"Steps/second: {step/final_elapsed:.2f}") - print(f"Initial loss: {initial_loss:.4f}") - print(f"Final loss: {final_loss:.4f}") - print(f"Improvement: {improvement:.1f}%") - print() - - return losses, step - - -# ============================================================================ -# Testing -# ============================================================================ - -def test_generation(model, test_sequences, char_to_idx, idx_to_char): - """Test generation quality.""" - print("=" * 70) - print("TESTING GENERATION") - print("=" * 70) - print() - - correct = 0 - total = len(test_sequences) - - for seq in test_sequences[:15]: # Test first 15 - tokens = encode(seq, char_to_idx, max_len=10) - - # Get predictions - x = Tensor(np.array([tokens[:-1]], dtype=np.int32), requires_grad=False) - logits = model.forward(x) - - # Predict each position - predicted_tokens = [] - for i in range(logits.shape[1]): - pred = int(np.argmax(logits.data[0, i, :])) - predicted_tokens.append(pred) - - # Compare - expected = tokens[1:] - match = all(e == p for e, p in zip(expected, predicted_tokens) if e != 0) - - if match: - correct += 1 - status = "βœ“" - else: - status = "βœ—" - - expected_str = decode(expected, idx_to_char) - predicted_str = decode(predicted_tokens, idx_to_char) - - print(f"{status} Input: {seq[:6]:8s} β†’ Expected: {expected_str:8s} | Got: {predicted_str:8s}") - - accuracy = (correct / 15) * 100 # Out of 15 tested - print() - print(f"Accuracy: {correct}/15 
({accuracy:.1f}%)") - print() - - return accuracy - - -# ============================================================================ -# Main -# ============================================================================ - -def main(): - print() - print("=" * 70) - print("MILESTONE 05 - 5-MINUTE TRAINING TEST") - print("=" * 70) - print() - print("Let's find out what we can learn in exactly 5 minutes!") - print() - - # Dataset - sequences = create_dataset() - char_to_idx, idx_to_char = create_tokenizer(sequences) - vocab_size = len(idx_to_char) - - print(f"Dataset: {len(sequences)} sequences (with repetition)") - print(f"Unique sequences: {len(set(sequences))}") - print(f"Vocabulary: {vocab_size} tokens") - print() - - # Encode - train_data = [encode(seq, char_to_idx, max_len=10) for seq in sequences] - - # Model: Ultra-tiny for maximum steps in 5 mins - # Goal: <1s per step β†’ ~300+ steps in 5 mins - # Strategy: Minimize params for speed - config = { - 'vocab_size': vocab_size, - 'embed_dim': 16, # Very small - 'num_layers': 1, # Just 1 layer! 
- 'num_heads': 2, # 2 heads - 'max_seq_len': 10, - } - - print("Model configuration:") - for key, val in config.items(): - print(f" {key}: {val}") - print() - - model = GPT(**config) - num_params = sum(np.prod(p.shape) for p in model.parameters()) - print(f"Parameters: {num_params:,}") - print() - - # Optimizer - optimizer = Adam(model.parameters(), lr=0.001) - loss_fn = CrossEntropyLoss() - - # Train for 5 minutes - print("Starting 5-minute training run...") - print("(Progress will be shown every minute)") - print() - - losses, total_steps = train_5_minutes( - model=model, - optimizer=optimizer, - loss_fn=loss_fn, - train_data=train_data, - max_time_seconds=300 # 5 minutes - ) - - # Test - print("Testing what the model learned...") - print() - accuracy = test_generation(model, sequences, char_to_idx, idx_to_char) - - # Final summary - print("=" * 70) - print("5-MINUTE TRAINING SUMMARY") - print("=" * 70) - print(f"βœ“ Model: {num_params:,} parameters") - print(f"βœ“ Steps completed: {total_steps}") - print(f"βœ“ Loss: {np.mean(losses[:10]):.4f} β†’ {np.mean(losses[-100:]):.4f}") - print(f"βœ“ Improvement: {(1 - np.mean(losses[-100:])/np.mean(losses[:10]))*100:.1f}%") - print(f"βœ“ Accuracy: {accuracy:.1f}%") - print() - - if accuracy >= 60: - print("πŸŽ‰ EXCELLENT! Model learned well in 5 minutes!") - elif accuracy >= 40: - print("βœ“ GOOD! 
Model is learning, could use more training.") - elif accuracy >= 20: - print("⚠️ FAIR: Model is learning but needs optimization.") - else: - print("⚠️ Model needs more training time or tuning.") - print() - - -if __name__ == "__main__": - main() - diff --git a/milestones/05_2017_transformer/test_gpt_learning.py b/milestones/05_2017_transformer/test_gpt_learning.py deleted file mode 100644 index b358942f..00000000 --- a/milestones/05_2017_transformer/test_gpt_learning.py +++ /dev/null @@ -1,744 +0,0 @@ -#!/usr/bin/env python3 -""" -Progressive Test Suite for TinyGPT Learning - -Tests transformer learning from absolute simplest to complex: -0. Memorize single sequence (MUST work) -1. Pattern completion (A B A β†’ B) -2. Copy task (COPY: X β†’ X) -3. Simple arithmetic (2+3 β†’ 5) -4. TinyTalks greetings - -This helps identify EXACTLY where learning breaks down. -""" - -import sys -import os -import numpy as np -import time - -project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(project_root) - -from rich.console import Console -from rich.panel import Panel -from rich.table import Table -from rich import box - -console = Console() - - -def run_test_0_memorize_sequence(): - """ - TEST 0: Memorize Single Sequence - - The ABSOLUTE simplest test. Can the model memorize ONE sequence? - "HELLO WORLD" repeated many times. 
- - If this fails, there's a fundamental bug in: - - Forward pass - - Loss computation - - Backward pass - - Parameter updates - """ - console.print("\n" + "=" * 70) - console.print("[bold cyan]TEST 0: Single Sequence Memorization[/bold cyan]") - console.print("=" * 70) - console.print("Task: Memorize 'HELLO WORLD' (repeated 100 times)") - console.print("Expected: Loss should drop to near 0") - console.print("Why: If this fails, autograd/optimizer is broken\n") - - from tinytorch.core.tensor import Tensor - from tinytorch.core.optimizers import Adam - from tinytorch.core.losses import CrossEntropyLoss - from tinytorch.core.autograd import enable_autograd - from tinytorch.text.tokenization import CharTokenizer - from tinytorch.text.embeddings import Embedding, PositionalEncoding - from tinytorch.models.transformer import TransformerBlock, LayerNorm - from tinytorch.core.layers import Linear - - enable_autograd() - - # Super simple data: just repeat "HELLO WORLD" - text = "HELLO WORLD " * 100 - - # Tokenize - tokenizer = CharTokenizer() - tokenizer.build_vocab([text]) - data = tokenizer.encode(text) - - console.print(f"Data length: {len(data)} tokens") - console.print(f"Vocabulary: {tokenizer.vocab_size} chars") - console.print(f"Unique text: '{text[:50]}...'\n") - - # Tiny model - vocab_size = tokenizer.vocab_size - embed_dim = 32 - seq_len = 16 - - # Build minimal model - embedding = Embedding(vocab_size, embed_dim) - pos_enc = PositionalEncoding(seq_len, embed_dim) - transformer = TransformerBlock(embed_dim, num_heads=2, mlp_ratio=2, dropout_prob=0.1) - ln = LayerNorm(embed_dim) - output_proj = Linear(embed_dim, vocab_size) - - params = [] - params.extend(embedding.parameters()) - params.extend(pos_enc.parameters()) - params.extend(transformer.parameters()) - params.extend(ln.parameters()) - params.extend(output_proj.parameters()) - - for p in params: - p.requires_grad = True - - console.print(f"Model: {len(params)} parameter tensors") - console.print(f"Embed dim: 
{embed_dim}, Seq len: {seq_len}\n") - - # Train - optimizer = Adam(params, lr=0.01) - criterion = CrossEntropyLoss() - - console.print("[yellow]Training (10 steps)...[/yellow]") - console.print("[dim]Watching for: loss decrease, gradient flow, parameter updates[/dim]\n") - - initial_loss = None - final_loss = None - - for step in range(10): - # Random sequence - start = np.random.randint(0, len(data) - seq_len - 1) - input_seq = data[start:start+seq_len] - target_seq = data[start+1:start+seq_len+1] - - console.print(f"[dim]Step {step+1}:[/dim]", end=" ") - - # Forward - x = Tensor(np.array([input_seq])) - y = Tensor(np.array([target_seq])) - - console.print(f"input shape={x.shape}", end=" ") - - # Through model - x = embedding(x) - console.print(f"embed_out={x.shape}", end=" ") - - x = pos_enc(x) - console.print(f"pos_out={x.shape}", end=" ") - - x = transformer(x) - console.print(f"trans_out={x.shape}", end=" ") - - x = ln(x) - console.print(f"ln_out={x.shape}", end=" ") - - # Reshape - batch, seq, dim = x.shape - x_2d = x.reshape(batch * seq, dim) - logits_2d = output_proj(x_2d) - logits = logits_2d.reshape(batch, seq, vocab_size) - - console.print(f"logits={logits.shape}", end=" ") - - # Loss - logits_flat = logits.reshape(batch * seq, vocab_size) - targets_flat = y.reshape(-1) - - console.print(f"logits_flat={logits_flat.shape} targets_flat={targets_flat.shape}", end=" ") - - loss = criterion(logits_flat, targets_flat) - - loss_val = float(loss.data) - console.print(f"loss={loss_val:.4f}", end=" ") - - # Check if loss has grad_fn - has_grad_fn = hasattr(loss, '_grad_fn') and loss._grad_fn is not None - console.print(f"has_grad_fn={has_grad_fn}", end=" ") - - # Backward - optimizer.zero_grad() - - console.print("backward...", end=" ") - loss.backward() - - # Check if params got gradients - params_with_grad = sum(1 for p in params if p.grad is not None and np.any(p.grad != 0)) - console.print(f"params_w_grad={params_with_grad}/{len(params)}", end=" ") - - 
optimizer.step() - console.print("updated") - - if step == 0: - initial_loss = loss_val - console.print(f" [yellow]β†’ Initial loss: {initial_loss:.4f}[/yellow]") - if step == 9: - final_loss = loss_val - - if step % 2 == 0 and step > 0: - console.print(f" [cyan]β†’ Loss so far: {loss_val:.4f}[/cyan]") - - # Result - console.print(f"\n[bold]Results:[/bold]") - console.print(f" Initial loss: {initial_loss:.4f}") - console.print(f" Final loss: {final_loss:.4f}") - console.print(f" Decrease: {initial_loss - final_loss:.4f}") - - if final_loss < initial_loss * 0.8: - console.print(f" [green]βœ“ PASS: Loss decreased significantly[/green]") - return True - else: - console.print(f" [red]βœ— FAIL: Loss didn't decrease enough[/red]") - console.print(f" [red]β†’ Bug in: autograd, optimizer, or forward pass[/red]") - return False - - -def run_test_1_pattern_completion(): - """ - TEST 1: Pattern Completion - - Can it learn: "A B A B A B" β†’ next is "A" - "1 2 1 2 1 2" β†’ next is "1" - - Tests: Can model learn simple repeating patterns? - """ - console.print("\n" + "=" * 70) - console.print("[bold cyan]TEST 1: Pattern Completion[/bold cyan]") - console.print("=" * 70) - console.print("Task: Learn repeating patterns (ABAB... β†’ A, 1212... 
β†’ 1)") - console.print("Expected: Predict next token correctly after training") - console.print("Why: Tests if attention can learn simple sequences\n") - - from tinytorch.core.tensor import Tensor - from tinytorch.core.optimizers import Adam - from tinytorch.core.losses import CrossEntropyLoss - from tinytorch.text.embeddings import Embedding, PositionalEncoding - from tinytorch.models.transformer import TransformerBlock, LayerNorm - from tinytorch.core.layers import Linear - - # Create pattern data - patterns = [ - "A B A B A B A B A B ", - "1 2 1 2 1 2 1 2 1 2 ", - "X Y X Y X Y X Y X Y ", - ] - - text = "".join(patterns * 50) # Repeat 50 times - - console.print(f"Data: {len(text)} chars") - console.print(f"Patterns: ABAB, 1212, XYXY") - console.print(f"Sample: '{text[:40]}...'\n") - - # Tokenize - chars = sorted(set(text)) - vocab_size = len(chars) - char_to_idx = {ch: i for i, ch in enumerate(chars)} - idx_to_char = {i: ch for i, ch in enumerate(chars)} - data = np.array([char_to_idx[ch] for ch in text]) - - console.print(f"Vocab: {vocab_size} chars: {repr(''.join(chars))}\n") - - # Build tiny model - embed_dim = 32 - num_heads = 2 - seq_len = 8 - - embedding = Embedding(vocab_size, embed_dim) - pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim) - transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1) - ln = LayerNorm(embed_dim) - output_proj = Linear(embed_dim, vocab_size) - - params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters() - - # Set requires_grad - for p in params: - p.requires_grad = True - - optimizer = Adam(params, lr=0.01) - criterion = CrossEntropyLoss() - - console.print(f"[yellow]Training (30 steps on patterns)...[/yellow]") - - initial_loss = None - final_loss = None - - for step in range(30): - start = np.random.randint(0, len(data) - seq_len - 1) - input_seq = data[start:start+seq_len] - target_seq = 
data[start+1:start+seq_len+1] - - x = Tensor(np.array([input_seq])) - y = Tensor(np.array([target_seq])) - - x = embedding(x) - x = pos_enc(x) - x = transformer(x) - x = ln(x) - - batch, seq, dim = x.shape - x_2d = x.reshape(batch * seq, dim) - logits_2d = output_proj(x_2d) - logits = logits_2d.reshape(batch, seq, vocab_size) - - logits_flat = logits.reshape(batch * seq, vocab_size) - targets_flat = y.reshape(-1) - loss = criterion(logits_flat, targets_flat) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - loss_val = float(loss.data) - if step == 0: - initial_loss = loss_val - if step == 29: - final_loss = loss_val - - if step % 10 == 0 or step == 29: - console.print(f" Step {step+1}: Loss = {loss_val:.4f}") - - decrease = initial_loss - final_loss - console.print(f"\n[bold]Results:[/bold]") - console.print(f" Initial: {initial_loss:.4f}") - console.print(f" Final: {final_loss:.4f}") - console.print(f" Decrease: {decrease:.4f}") - - if decrease > 0.5: - console.print(f" [green]βœ“ PASS: Loss decreased significantly[/green]") - return True - else: - console.print(f" [red]βœ— FAIL: Loss didn't decrease enough[/red]") - return False - - -def run_test_2_copy_task(): - """ - TEST 2: Copy Task - - Input: "COPY: hello" - Output: "hello" - - Classic transformer test from research papers. 
- """ - console.print("\n" + "=" * 70) - console.print("[bold cyan]TEST 2: Copy Task[/bold cyan]") - console.print("=" * 70) - console.print("Task: COPY: X β†’ X (reproduce input)") - console.print("Expected: Model learns to copy the input text") - console.print("Why: Classic test of attention mechanism\n") - - from tinytorch.core.tensor import Tensor - from tinytorch.core.optimizers import Adam - from tinytorch.core.losses import CrossEntropyLoss - from tinytorch.text.embeddings import Embedding, PositionalEncoding - from tinytorch.models.transformer import TransformerBlock, LayerNorm - from tinytorch.core.layers import Linear - - # Create copy task data - words = ["hello", "world", "test", "copy", "learn", "task"] - examples = [] - for word in words: - examples.append(f"COPY:{word}={word} ") - - text = "".join(examples * 50) # Repeat - - console.print(f"Data: {len(text)} chars") - console.print(f"Examples: COPY:hello=hello, COPY:world=world") - console.print(f"Sample: '{text[:50]}...'\n") - - # Tokenize - chars = sorted(set(text)) - vocab_size = len(chars) - char_to_idx = {ch: i for i, ch in enumerate(chars)} - data = np.array([char_to_idx[ch] for ch in text]) - - console.print(f"Vocab: {vocab_size} chars\n") - - # Build model - embed_dim = 32 - num_heads = 2 - seq_len = 16 - - embedding = Embedding(vocab_size, embed_dim) - pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim) - transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1) - ln = LayerNorm(embed_dim) - output_proj = Linear(embed_dim, vocab_size) - - params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters() - for p in params: - p.requires_grad = True - - optimizer = Adam(params, lr=0.01) - criterion = CrossEntropyLoss() - - console.print(f"[yellow]Training (40 steps on copy task)...[/yellow]") - - initial_loss = None - final_loss = None - - for step in range(40): - start = 
np.random.randint(0, len(data) - seq_len - 1) - input_seq = data[start:start+seq_len] - target_seq = data[start+1:start+seq_len+1] - - x = Tensor(np.array([input_seq])) - y = Tensor(np.array([target_seq])) - - x = embedding(x) - x = pos_enc(x) - x = transformer(x) - x = ln(x) - - batch, seq, dim = x.shape - x_2d = x.reshape(batch * seq, dim) - logits_2d = output_proj(x_2d) - logits = logits_2d.reshape(batch, seq, vocab_size) - - logits_flat = logits.reshape(batch * seq, vocab_size) - targets_flat = y.reshape(-1) - loss = criterion(logits_flat, targets_flat) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - loss_val = float(loss.data) - if step == 0: - initial_loss = loss_val - if step == 39: - final_loss = loss_val - - if step % 10 == 0 or step == 39: - console.print(f" Step {step+1}: Loss = {loss_val:.4f}") - - decrease = initial_loss - final_loss - console.print(f"\n[bold]Results:[/bold]") - console.print(f" Initial: {initial_loss:.4f}") - console.print(f" Final: {final_loss:.4f}") - console.print(f" Decrease: {decrease:.4f}") - - if decrease > 0.5: - console.print(f" [green]βœ“ PASS: Loss decreased[/green]") - return True - else: - console.print(f" [red]βœ— FAIL: Loss didn't decrease enough[/red]") - return False - - -def run_test_3_simple_arithmetic(): - """ - TEST 3: Simple Arithmetic - - 2+3=5 - 1+1=2 - 5-2=3 - - Tests: Can model learn simple rules? - """ - console.print("\n" + "=" * 70) - console.print("[bold cyan]TEST 3: Simple Arithmetic[/bold cyan]") - console.print("=" * 70) - console.print("Task: 2+3=5, 1+1=2, etc. 
(single digit)") - console.print("Expected: Correct answers after training") - console.print("Why: Tests reasoning ability\n") - - from tinytorch.core.tensor import Tensor - from tinytorch.core.optimizers import Adam - from tinytorch.core.losses import CrossEntropyLoss - from tinytorch.text.embeddings import Embedding, PositionalEncoding - from tinytorch.models.transformer import TransformerBlock, LayerNorm - from tinytorch.core.layers import Linear - - # Create arithmetic data - examples = [] - for a in range(1, 6): - for b in range(1, 6): - examples.append(f"{a}+{b}={a+b} ") - - text = "".join(examples * 30) # Repeat - - console.print(f"Data: {len(text)} chars") - console.print(f"Examples: 1+1=2, 2+3=5, 4+5=9") - console.print(f"Sample: '{text[:40]}...'\n") - - # Tokenize - chars = sorted(set(text)) - vocab_size = len(chars) - char_to_idx = {ch: i for i, ch in enumerate(chars)} - data = np.array([char_to_idx[ch] for ch in text]) - - console.print(f"Vocab: {vocab_size} chars\n") - - # Build model - embed_dim = 48 - num_heads = 3 - seq_len = 12 - - embedding = Embedding(vocab_size, embed_dim) - pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim) - transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1) - ln = LayerNorm(embed_dim) - output_proj = Linear(embed_dim, vocab_size) - - params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters() - for p in params: - p.requires_grad = True - - optimizer = Adam(params, lr=0.01) - criterion = CrossEntropyLoss() - - console.print(f"[yellow]Training (50 steps on arithmetic)...[/yellow]") - - initial_loss = None - final_loss = None - - for step in range(50): - start = np.random.randint(0, len(data) - seq_len - 1) - input_seq = data[start:start+seq_len] - target_seq = data[start+1:start+seq_len+1] - - x = Tensor(np.array([input_seq])) - y = Tensor(np.array([target_seq])) - - x = embedding(x) - x = 
pos_enc(x) - x = transformer(x) - x = ln(x) - - batch, seq, dim = x.shape - x_2d = x.reshape(batch * seq, dim) - logits_2d = output_proj(x_2d) - logits = logits_2d.reshape(batch, seq, vocab_size) - - logits_flat = logits.reshape(batch * seq, vocab_size) - targets_flat = y.reshape(-1) - loss = criterion(logits_flat, targets_flat) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - loss_val = float(loss.data) - if step == 0: - initial_loss = loss_val - if step == 49: - final_loss = loss_val - - if step % 10 == 0 or step == 49: - console.print(f" Step {step+1}: Loss = {loss_val:.4f}") - - decrease = initial_loss - final_loss - console.print(f"\n[bold]Results:[/bold]") - console.print(f" Initial: {initial_loss:.4f}") - console.print(f" Final: {final_loss:.4f}") - console.print(f" Decrease: {decrease:.4f}") - - if decrease > 0.3: - console.print(f" [green]βœ“ PASS: Loss decreased[/green]") - console.print(f" [dim](arithmetic is harder, so lower threshold)[/dim]") - return True - else: - console.print(f" [red]βœ— FAIL: Loss didn't decrease enough[/red]") - return False - - -def run_test_4_tinytalks_level1(): - """ - TEST 4: TinyTalks Level 1 - - Q: Hello! - A: Hi there! - - The actual task we want to solve. 
- """ - console.print("\n" + "=" * 70) - console.print("[bold cyan]TEST 4: TinyTalks Level 1[/bold cyan]") - console.print("=" * 70) - console.print("Task: Learn greeting Q&A pairs from TinyTalks") - console.print("Expected: Can respond to greetings") - console.print("Why: The actual milestone goal\n") - - from tinytorch.core.tensor import Tensor - from tinytorch.core.optimizers import Adam - from tinytorch.core.losses import CrossEntropyLoss - from tinytorch.text.embeddings import Embedding, PositionalEncoding - from tinytorch.models.transformer import TransformerBlock, LayerNorm - from tinytorch.core.layers import Linear - - # Load TinyTalks Level 1 data - try: - with open("datasets/tinytalks/splits/train.txt", "r") as f: - full_text = f.read() - - # Heuristic: Level 1 = very short Q&A (< 40 chars each) - lines = full_text.split('\n') - level_1_text = [] - for i in range(0, len(lines) - 1, 3): # Q, A, blank - if i+1 < len(lines): - q_line = lines[i] - a_line = lines[i+1] - if q_line.startswith('Q:') and a_line.startswith('A:'): - if len(q_line) < 40 and len(a_line) < 40: - level_1_text.append(q_line + '\n' + a_line + '\n\n') - - if not level_1_text: - console.print("[red]No Level 1 data found, using first 10 Q&A[/red]") - level_1_text = [full_text[:500]] - - text = "".join(level_1_text[:10]) # First 10 simple Q&A - - console.print(f"Data: {len(text)} chars (Level 1 greetings)") - console.print(f"Sample:\n{text[:100]}...\n") - - except FileNotFoundError: - console.print("[red]TinyTalks not found, skipping Test 4[/red]") - return None - - # Tokenize - chars = sorted(set(text)) - vocab_size = len(chars) - char_to_idx = {ch: i for i, ch in enumerate(chars)} - data = np.array([char_to_idx[ch] for ch in text]) - - console.print(f"Vocab: {vocab_size} chars\n") - - # Build model (slightly larger for Q&A) - embed_dim = 64 - num_heads = 4 - seq_len = 32 - - embedding = Embedding(vocab_size, embed_dim) - pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim) 
- transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1) - ln = LayerNorm(embed_dim) - output_proj = Linear(embed_dim, vocab_size) - - params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters() - for p in params: - p.requires_grad = True - - optimizer = Adam(params, lr=0.005) # Lower LR for Q&A - criterion = CrossEntropyLoss() - - console.print(f"[yellow]Training (100 steps on TinyTalks Level 1)...[/yellow]") - - initial_loss = None - final_loss = None - - for step in range(100): - if len(data) < seq_len + 1: - console.print("[red]Dataset too small[/red]") - return None - - start = np.random.randint(0, len(data) - seq_len - 1) - input_seq = data[start:start+seq_len] - target_seq = data[start+1:start+seq_len+1] - - x = Tensor(np.array([input_seq])) - y = Tensor(np.array([target_seq])) - - x = embedding(x) - x = pos_enc(x) - x = transformer(x) - x = ln(x) - - batch, seq, dim = x.shape - x_2d = x.reshape(batch * seq, dim) - logits_2d = output_proj(x_2d) - logits = logits_2d.reshape(batch, seq, vocab_size) - - logits_flat = logits.reshape(batch * seq, vocab_size) - targets_flat = y.reshape(-1) - loss = criterion(logits_flat, targets_flat) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - loss_val = float(loss.data) - if step == 0: - initial_loss = loss_val - if step == 99: - final_loss = loss_val - - if step % 20 == 0 or step == 99: - console.print(f" Step {step+1}: Loss = {loss_val:.4f}") - - decrease = initial_loss - final_loss - console.print(f"\n[bold]Results:[/bold]") - console.print(f" Initial: {initial_loss:.4f}") - console.print(f" Final: {final_loss:.4f}") - console.print(f" Decrease: {decrease:.4f}") - - if decrease > 0.3: - console.print(f" [green]βœ“ PASS: Model is learning TinyTalks![/green]") - console.print(f" [cyan]β†’ Now train full model with tinytalks_gpt.py[/cyan]") - return True - else: - console.print(f" [yellow]⚠ 
PARTIAL: Some learning, may need more steps[/yellow]") - return False - - -def main(): - """Run all tests in sequence""" - console.print("\n") - console.print(Panel( - "[bold cyan]TinyGPT Learning Diagnostic Suite[/bold cyan]\n\n" - "Progressive tests from simplest to complex:\n" - " 0. Single sequence memorization (MUST work)\n" - " 1. Pattern completion (A B A β†’ B)\n" - " 2. Copy task (COPY: X β†’ X)\n" - " 3. Simple arithmetic (2+3 β†’ 5)\n" - " 4. TinyTalks greetings (Q&A)\n\n" - "[yellow]This identifies EXACTLY where learning breaks down[/yellow]", - title="πŸ”¬ Diagnostic Tests", - border_style="cyan", - box=box.DOUBLE - )) - - results = {} - - # Run tests - try: - results[0] = run_test_0_memorize_sequence() - except Exception as e: - console.print(f"\n[red]Test 0 crashed: {str(e)}[/red]") - results[0] = False - - # Only run next tests if previous passed - if results.get(0): - results[1] = run_test_1_pattern_completion() - results[2] = run_test_2_copy_task() - results[3] = run_test_3_simple_arithmetic() - results[4] = run_test_4_tinytalks_level1() - - # Summary - console.print("\n" + "=" * 70) - console.print("[bold]Test Summary:[/bold]") - console.print("=" * 70) - - for test_num, result in results.items(): - if result is True: - console.print(f" Test {test_num}: [green]βœ“ PASS[/green]") - elif result is False: - console.print(f" Test {test_num}: [red]βœ— FAIL[/red]") - else: - console.print(f" Test {test_num}: [yellow]β—‹ TODO[/yellow]") - - console.print("\n" + "=" * 70) - - if results.get(0) is False: - console.print("[bold red]CRITICAL: Test 0 failed![/bold red]") - console.print("The transformer cannot even memorize a single sequence.") - console.print("This indicates a fundamental bug in:") - console.print(" - Forward pass computation") - console.print(" - Autograd backward pass") - console.print(" - Optimizer parameter updates") - console.print(" - Loss computation") - - -if __name__ == "__main__": - main() - diff --git 
a/milestones/05_2017_transformer/test_tinytalks_learning.py b/milestones/05_2017_transformer/test_tinytalks_learning.py deleted file mode 100644 index b3f51d3f..00000000 --- a/milestones/05_2017_transformer/test_tinytalks_learning.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -""" -Quick diagnostic to test if the model can learn ANY pattern at all. -""" - -import sys -import os -import numpy as np - -project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(project_root) - -from tinytorch.core.tensor import Tensor -from tinytorch.core.optimizers import Adam -from tinytorch.core.losses import CrossEntropyLoss -from tinytorch.core.autograd import enable_autograd -from tinytorch.text.tokenization import CharTokenizer - -# Enable autograd -enable_autograd() - -# Super simple test: Can the model learn to predict "A" after "Q:"? -test_data = """Q: Hello! -A: Hi there! - -Q: What is your name? -A: I am TinyBot. - -Q: What color is the sky? -A: The sky is blue. 
-""" - -print("Testing if model can learn simple patterns...") -print(f"Test data: {repr(test_data[:100])}...") - -# Build tokenizer -tokenizer = CharTokenizer() -tokenizer.build_vocab([test_data]) -tokens = tokenizer.encode(test_data) - -print(f"Vocabulary size: {tokenizer.vocab_size}") -print(f"Total tokens: {len(tokens)}") -print(f"First 20 tokens: {tokens[:20]}") -print(f"Decoded: {repr(tokenizer.decode(tokens[:20]))}") - -# Check specific patterns -q_colon_tokens = tokenizer.encode("Q:") -print(f"\n'Q:' tokens: {q_colon_tokens}") -print(f"'Q:' decoded: {repr(tokenizer.decode(q_colon_tokens))}") - -a_colon_tokens = tokenizer.encode("A:") -print(f"'A:' tokens: {a_colon_tokens}") -print(f"'A:' decoded: {repr(tokenizer.decode(a_colon_tokens))}") - -# Find all occurrences of "Q:" followed by space/newline then "A:" -print("\nPattern analysis:") -text_str = test_data -q_count = text_str.count("Q:") -a_count = text_str.count("A:") -print(f"'Q:' appears: {q_count} times") -print(f"'A:' appears: {a_count} times") - -print("\nβœ… Tokenizer is working correctly!") -print("\nConclusion: The model should be able to learn that 'A:' follows 'Q:'") -print("If it's generating garbage, the model is either:") -print(" 1. Too small (need more parameters)") -print(" 2. Not trained enough (need more epochs)") -print(" 3. Learning rate is wrong") -print(" 4. Or there's a bug in the training loop") - diff --git a/milestones/05_2017_transformer/tinystories_gpt.py b/milestones/05_2017_transformer/tinystories_gpt.py deleted file mode 100644 index b08b0ee4..00000000 --- a/milestones/05_2017_transformer/tinystories_gpt.py +++ /dev/null @@ -1,604 +0,0 @@ -#!/usr/bin/env python3 -""" -TinyStories Text Generation (2017) - Transformer Era -==================================================== - -πŸ“š HISTORICAL CONTEXT: -In 2017, Vaswani et al. published "Attention Is All You Need", showing that -attention mechanisms alone (no RNNs!) could achieve state-of-the-art results -on sequence tasks. 
This breakthrough launched the era of GPT, BERT, and modern LLMs. - -🎯 WHAT YOU'RE BUILDING: -Using YOUR TinyTorch implementations, you'll build a character-level language model -that generates simple stories - proving YOUR attention mechanism works! - -TinyStories is MUCH EASIER than Shakespeare: -- Simple vocabulary (children's stories vs archaic English) -- Clear sentence structure -- Designed specifically for small models like ours! -- Faster convergence and better results - -βœ… REQUIRED MODULES (Run after Module 13): -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - Module 02 (Tensor) : YOUR data structure with autograd - Module 03 (Activations) : YOUR ReLU in feed-forward networks - Module 04 (Layers) : YOUR Linear layers - Module 08 (Optimizers) : YOUR Adam optimizer - Module 10 (Tokenization) : YOUR CharTokenizer for textβ†’numbers - Module 11 (Embeddings) : YOUR token & positional embeddings - Module 12 (Attention) : YOUR multi-head self-attention - Module 13 (Transformers) : YOUR LayerNorm + TransformerBlock -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -πŸ—οΈ ARCHITECTURE (Character-Level Language Model): - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Output Predictions β”‚ - β”‚ Character Probabilities (vocab_size) β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Output Projection β”‚ - β”‚ Module 04: vectors β†’ vocabulary β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Layer Norm β”‚ - β”‚ Module 13: Final normalization β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - ╔══════════════════════════════════════════════════════════════════════════════╗ - β•‘ Transformer Block Γ— N (Repeat) β•‘ - β•‘ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ - β•‘ β”‚ Feed Forward Network β”‚ β•‘ - β•‘ β”‚ Module 04: Linear β†’ ReLU β†’ Linear β”‚ β•‘ - β•‘ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ - β•‘ β–² β•‘ - β•‘ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ - β•‘ β”‚ Multi-Head Self-Attention β”‚ β•‘ - β•‘ β”‚ Module 12: QueryΒ·Key^TΒ·Value across all positions β”‚ β•‘ - β•‘ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ - β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β• - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Positional Encoding β”‚ - β”‚ Module 11: Add position information β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Character Embeddings β”‚ - β”‚ Module 11: chars β†’ embed_dim vectors β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Input Characters β”‚ - β”‚ "To be or not to be, that is..." β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - -πŸ“Š EXPECTED PERFORMANCE: -- Dataset: ~21MB TinyStories validation set (simple children's stories) -- Training time: 30-45 minutes (proper training, faster than Shakespeare!) -- Vocabulary: ~90 unique characters (simple English) -- Expected: Coherent simple stories with proper grammar -- Parameters: ~4.8M (perfect size for this task) -""" - -import sys -import os -import numpy as np -import argparse -import time -from rich.console import Console -from rich.panel import Panel -from rich.table import Table -from rich import box - -# Add project root to path -project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(project_root) - -console = Console() - -# Import TinyTorch components YOU BUILT! -from tinytorch.core.tensor import Tensor # Module 02: YOU built this! -from tinytorch.core.layers import Linear # Module 04: YOU built this! -from tinytorch.core.activations import ReLU, Softmax # Module 03: YOU built this! -from tinytorch.core.optimizers import Adam # Module 08: YOU built this! -from tinytorch.core.losses import CrossEntropyLoss # Module 04: YOU built this! -from tinytorch.text.tokenization import CharTokenizer # Module 10: YOU built this! -from tinytorch.text.embeddings import Embedding, PositionalEncoding # Module 11: YOU built this! -from tinytorch.core.attention import MultiHeadAttention # Module 12: YOU built this! 
-from tinytorch.models.transformer import LayerNorm, TransformerBlock # Module 13: YOU built this! -from tinytorch.data.loader import DataLoader, Dataset # Module 08: YOU built this! - -# Import dataset manager -from data_manager import DatasetManager - - -class TinyStoriesDataset(Dataset): - """ - Character-level TinyStories dataset using YOUR Dataset interface (Module 08) - and YOUR CharTokenizer (Module 10)! - - Tokenizes simple children's stories into characters for language modeling. - Much easier to learn than Shakespeare! - """ - - def __init__(self, text, seq_length=64): - """ - Initialize dataset with text and sequence length. - - Args: - text: Raw Shakespeare text - seq_length: Length of input sequences - """ - # Use YOUR CharTokenizer from Module 10! - self.tokenizer = CharTokenizer() - self.tokenizer.build_vocab([text]) # Build vocabulary from Shakespeare corpus - self.vocab_size = self.tokenizer.vocab_size - - # Convert text to indices using YOUR tokenizer! - self.data = self.tokenizer.encode(text) - self.seq_length = seq_length - - # Calculate number of sequences - self.num_sequences = len(self.data) - seq_length - - def __getitem__(self, idx): - """Get a single training sequence - YOUR Dataset interface!""" - # Input: characters at positions [idx, idx+seq_length) - # Target: characters at positions [idx+1, idx+seq_length+1) - input_seq = self.data[idx:idx + self.seq_length] - target_seq = self.data[idx + 1:idx + self.seq_length + 1] - - return Tensor(np.array(input_seq, dtype=np.int32)), Tensor(np.array(target_seq, dtype=np.int32)) - - def __len__(self): - """Return dataset size - YOUR Dataset interface!""" - return self.num_sequences - - def decode(self, indices): - """Convert indices back to text using YOUR tokenizer!""" - return self.tokenizer.decode(indices) - - -class TinyGPT: - """ - Character-level Transformer Language Model using YOUR TinyTorch! - - This architecture is what powers GPT, ChatGPT, and modern LLMs. 
- """ - - def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers): - # Token representation - self.embedding = Embedding(vocab_size, embed_dim) # Module 11! - self.pos_encoding = PositionalEncoding(max_length, embed_dim) # Module 11! - - # Transformer stack - self.layers = [] - mlp_ratio = 4 # Standard 4x expansion in FFN (embed_dim * 4) - for _ in range(num_layers): - block = TransformerBlock(embed_dim, num_heads, mlp_ratio) # Module 13! - self.layers.append(block) - - # Output head - self.layer_norm = LayerNorm(embed_dim) # Module 13! - self.output_proj = Linear(embed_dim, vocab_size) # Module 04! - - self.vocab_size = vocab_size - self.embed_dim = embed_dim - self.num_layers = num_layers - self.num_heads = num_heads - - # Calculate parameters - self.total_params = self._count_parameters() - - def _count_parameters(self): - """Count total parameters in model.""" - count = 0 - for param in self.parameters(): - count += param.data.size - return count - - def parameters(self): - """Get all trainable parameters from YOUR model.""" - params = [] - # Embedding parameters - params.extend([self.embedding.weight]) - params.extend(self.pos_encoding.parameters()) # Add positional encoding params! 
- # Transformer block parameters - for layer in self.layers: - if hasattr(layer, 'parameters'): - if callable(layer.parameters): - params.extend(layer.parameters()) - else: - params.extend(layer.parameters) - # Output projection parameters - params.extend([self.layer_norm.gamma, self.layer_norm.beta]) - params.extend([self.output_proj.weight, self.output_proj.bias]) - - # Ensure all parameters have requires_grad=True - for param in params: - param.requires_grad = True - - return params - - def forward(self, x): - """Forward pass through YOUR transformer stack.""" - # Convert tokens to contextual vectors - x = self.embedding.forward(x) # Module 11: char β†’ vectors - x = self.pos_encoding.forward(x) # Module 11: add position info - - # Process through transformer layers - for layer in self.layers: - x = layer.forward(x) # Module 13: Attention β†’ FFN - - # Generate predictions - x = self.layer_norm.forward(x) # Module 13: final norm - - # Reshape for Linear layer - KEEP COMPUTATION GRAPH! - batch_size, seq_len, embed_dim = x.shape - x_2d = x.reshape(batch_size * seq_len, embed_dim) # Use Tensor.reshape() - - # Apply output projection - logits_2d = self.output_proj(x_2d) # Module 04: vocab predictions - - # Reshape back - KEEP COMPUTATION GRAPH! 
- logits = logits_2d.reshape(batch_size, seq_len, self.vocab_size) # Use Tensor.reshape() - - return logits - - -def visualize_transformer(): - """Show how transformers process text sequences.""" - console.print("") - console.print(Panel.fit( - "[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n" - "[yellow]The Problem:[/yellow]\n" - "RNNs process sequences one step at a time\n" - "Can't parallelize β†’ slow training on long sequences\n" - "Struggle with long-range dependencies\n\n" - "[green]The Innovation:[/green]\n" - "Transformers: Attention mechanisms process ENTIRE sequences in parallel\n" - " β€’ Self-attention: Every token attends to every other token\n" - " β€’ Multi-head attention: Learn multiple attention patterns\n" - " β€’ Positional encoding: Preserve sequence order\n\n" - "[bold]Can attention alone match RNN performance?[/bold]", - title="🎯 ACT 1: THE CHALLENGE", - border_style="cyan", - box=box.DOUBLE - )) - - console.print(""" - How YOUR Transformer Sees Text: What It Learns: - - Input: "To be or not to be" Layer 1 (Attention): - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β€’ Each word attends to others - β”‚ T o b e o r ... β”‚ β€’ "be" looks at "To", "or", etc. - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β€’ Captures dependencies - ↓ - Character Embeddings Layer 2-4 (Deep Attention): - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β€’ Builds complex patterns - β”‚ 128-dim vectors β”‚ β€’ Grammar, style, meaning - β”‚ for each character β”‚ β€’ Shakespeare-specific patterns - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - ↓ Output Prediction: - Position Encoding "To be or not to be, that is the" - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” ↓ - β”‚ Add positional info β”‚ Next char probabilities: - β”‚ (order matters!) β”‚ 't' β†’ 0.85 (highest!) 
- β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ 'n' β†’ 0.03 - ↓ 'a' β†’ 0.02 - Transformer Layers Γ—4 ... - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Self-Attention β”‚ Key Transformer Insight: - β”‚ Feed-Forward β”‚ Unlike RNNs, attention lets each - β”‚ Layer Norm β”‚ position look at ALL others - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ simultaneously - capturing long-range - ↓ dependencies in O(1) operations! - Character Predictions - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Probability for β”‚ - β”‚ each next character β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - """) - print("="*70) - - -def train_tinystories_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.01): - """Train TinyGPT using YOUR complete training system with DataLoader!""" - console.print("\n[bold]πŸš€ Training TinyStories TinyGPT with YOUR TinyTorch![/bold]") - console.print(f" Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences") - console.print(f" Batch size: [cyan]{train_loader.batch_size}[/cyan]") - console.print(f" Learning rate: [cyan]{learning_rate}[/cyan] (1e-2, optimal for 4.8M param model)") - console.print(f" YOUR DataLoader (Module 08) handles batching!") - console.print(f" YOUR Adam optimizer (Module 08)") - console.print(f" YOUR CrossEntropyLoss (Module 04) with autograd!") - - # YOUR optimizer and loss function - # Using 1e-2 learning rate (optimal for our 4.8M param model, validated by debug script) - # Note: Large models (100M+) use 3e-4, but smaller models need higher LR - optimizer = Adam(model.parameters(), lr=learning_rate) - loss_fn = CrossEntropyLoss() # YOUR loss function with autograd! - - for epoch in range(epochs): - console.print(f"\n [bold]Epoch {epoch+1}/{epochs}:[/bold]") - epoch_loss = 0 - batch_count = 0 - - # Use YOUR DataLoader to iterate through batches! 
- for batch_idx, (batch_input, batch_target) in enumerate(train_loader): - if batch_idx >= 500: # Training mode - process more batches - break - - if batch_idx == 0: - console.print(f" [dim]Processing first batch... (this may take a moment)[/dim]") - - # Forward pass with YOUR Transformer - logits = model(batch_input) # YOUR attention mechanism! - - # Reshape for loss computation: (batch, seq, vocab) -> (batch*seq, vocab) - # IMPORTANT: Use Tensor.reshape() to preserve computation graph! - batch_size, seq_length, vocab_size = logits.shape - logits_2d = logits.reshape(batch_size * seq_length, vocab_size) - targets_1d = batch_target.reshape(-1) - - # Compute loss with YOUR CrossEntropyLoss (connects to autograd!) - loss = loss_fn.forward(logits_2d, targets_1d) # Module 04 + Module 05! - loss_value = float(loss.data) - - # Backward pass with YOUR autograd - optimizer.zero_grad() # Module 08! - loss.backward() # Module 05: YOUR autodiff! - optimizer.step() # Module 08! - - epoch_loss += loss_value - batch_count += 1 - - # Progress - show output frequently so user sees continuous training - if batch_idx == 0 or (batch_idx + 1) % 10 == 0 or (batch_idx + 1) % 50 == 0: - avg_loss = epoch_loss / batch_count - console.print(f" Batch {batch_idx+1}/500 | Loss: {loss_value:.4f} | Avg: {avg_loss:.4f}") - - # Epoch summary - avg_loss = epoch_loss / max(1, batch_count) - console.print(f" β†’ Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)") - - return model - - -def generate_text(model, dataset, prompt="To be or not", max_length=200, temperature=0.8): - """ - Generate text from a prompt - THE WOW MOMENT! - - This is autoregressive generation: predict next char, add it, repeat. 
- """ - console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]") - console.print("="*70) - - # Convert prompt to indices - prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx] - generated = prompt_indices.copy() - - console.print(f"πŸ“ Prompt: [cyan]\"{prompt}\"[/cyan]") - console.print(f"🎯 Generating [cyan]{max_length}[/cyan] characters...\n") - - # Generate character by character - for _ in range(max_length): - # Take last seq_length characters as input - input_seq = generated[-dataset.seq_length:] if len(generated) >= dataset.seq_length else generated - - # Pad if necessary - if len(input_seq) < dataset.seq_length: - input_seq = [0] * (dataset.seq_length - len(input_seq)) + input_seq - - # Forward pass - input_tensor = Tensor(np.array([input_seq], dtype=np.int32)) - logits = model(input_tensor) - - # Get logits for last position - logits_np = np.array(logits.data.data if hasattr(logits.data, 'data') else logits.data) - next_logits = logits_np[0, -1, :] # Last position predictions - - # Apply temperature and sample - next_logits = next_logits / temperature - exp_logits = np.exp(next_logits - np.max(next_logits)) - probs = exp_logits / np.sum(exp_logits) - - # Sample from distribution - next_idx = np.random.choice(len(probs), p=probs) - generated.append(next_idx) - - # Decode to text - generated_text = dataset.decode(generated) - - console.print("[bold]πŸ“– Generated Text:[/bold]") - console.print("─" * 70) - console.print(f"[green]{generated_text}[/green]") - console.print("─" * 70) - - return generated_text - - -def analyze_transformer_systems(model): - """Analyze YOUR Transformer from an ML systems perspective.""" - console.print("") - console.print(Panel.fit( - f"[bold]Model Architecture:[/bold]\n" - f" β€’ Parameters: [cyan]{model.total_params:,}[/cyan] weights\n" - f" β€’ Embedding dim: [cyan]{model.embed_dim}[/cyan]\n" - f" β€’ Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n" - - 
"[bold]Computational Complexity:[/bold]\n" - " β€’ Attention: O(nΒ²Β·d) where n=sequence, d=dimension\n" - " β€’ Self-attention allows parallel processing (vs RNN sequential)\n" - " β€’ YOUR implementation: Pure Python + NumPy\n\n" - - f"[bold]Memory Requirements:[/bold]\n" - f" β€’ Parameters: [cyan]{model.total_params * 4 / 1024:.1f} KB[/cyan]\n" - " β€’ Attention matrices: O(nΒ²) per layer\n" - " β€’ YOUR TinyTorch tracks gradients automatically\n\n" - - "[bold]πŸ›οΈ Transformer Evolution:[/bold]\n" - " β€’ 2017: Vaswani et al. 'Attention Is All You Need'\n" - " β€’ 2018: BERT (bidirectional), GPT (autoregressive)\n" - " β€’ 2020: GPT-3 (175B params, same architecture!)\n" - " β€’ 2022: ChatGPT (YOUR architecture at massive scale)\n" - " β€’ YOUR TinyGPT: Core principles that power them all!\n\n" - - "[bold]πŸ’‘ Why Transformers Dominate:[/bold]\n" - " β€’ Parallelizable (vs sequential RNNs)\n" - " β€’ Long-range dependencies (attention sees everything)\n" - " β€’ Scalable (architecture works from 1M to 175B params)\n" - " β€’ YOUR implementation demonstrates all of these!", - - title="πŸ”¬ SYSTEMS ANALYSIS", - border_style="cyan", - box=box.DOUBLE - )) - - -def main(): - """Demonstrate Shakespeare text generation using YOUR TinyTorch!""" - - parser = argparse.ArgumentParser(description='Shakespeare Transformer 2017') - parser.add_argument('--test-only', action='store_true', - help='Test architecture only') - parser.add_argument('--epochs', type=int, default=20, - help='Training epochs') - parser.add_argument('--batch-size', type=int, default=32, - help='Batch size') - parser.add_argument('--seq-length', type=int, default=128, - help='Sequence length') - parser.add_argument('--embed-dim', type=int, default=256, - help='Embedding dimension') - parser.add_argument('--num-layers', type=int, default=6, - help='Number of transformer layers') - parser.add_argument('--num-heads', type=int, default=8, - help='Number of attention heads') - 
parser.add_argument('--visualize', action='store_true', default=True, - help='Show transformer visualization') - parser.add_argument('--quick-test', action='store_true', - help='Use small subset for testing') - args = parser.parse_args() - - console.print("") - console.print(Panel.fit( - "[bold cyan]TinyStories Transformer - Simple Story Generation![/bold cyan]\n\n" - "[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n" - "[green]YOUR achievement:[/green] Generate coherent children's stories\n" - "[cyan]Components used:[/cyan] YOUR complete NLP pipeline (Modules 2, 3, 4, 8, 10, 11, 12, 13)\n" - "[dim]Note: TinyStories is much easier than Shakespeare - designed for small models![/dim]", - title="🎯 Milestone 05: Transformer Era (2017)", - border_style="cyan", - box=box.DOUBLE - )) - - # Visualization - if args.visualize: - visualize_transformer() - - # Step 1: Load TinyStories dataset - console.print("\n[bold]πŸ“₯ Loading TinyStories dataset...[/bold]") - - # Load TinyStories from downloaded file - tinystories_path = os.path.join( - os.path.dirname(__file__), - '../datasets/tinystories/tinystories_val.txt' - ) - - if not os.path.exists(tinystories_path): - console.print(f"[red]❌ TinyStories not found at {tinystories_path}[/red]") - console.print("[yellow]Run: python milestones/05_2017_transformer/download_tinystories.py[/yellow]") - return - - with open(tinystories_path, 'r', encoding='utf-8') as f: - text = f.read() - - console.print(f"πŸ“Š Loaded: {len(text):,} characters, {len(text.split()):,} words") - - if args.quick_test: - text = text[:100000] # Use small subset for testing (100K chars) - console.print(" [dim](Using 100K char subset for quick testing)[/dim]") - - # Step 2: Create Dataset and DataLoader using YOUR Module 08! 
- console.print(f"\n[bold]πŸ“¦ Creating YOUR Dataset and DataLoader (Module 08)...[/bold]") - dataset = TinyStoriesDataset(text, seq_length=args.seq_length) - - # YOUR DataLoader handles batching and shuffling! - train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True) - - console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters") - console.print(f" Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]") - console.print(f" DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]") - - # Step 3: Build Transformer - model = TinyGPT( - vocab_size=dataset.vocab_size, - embed_dim=args.embed_dim, - max_length=args.seq_length, - num_heads=args.num_heads, - num_layers=args.num_layers - ) - - # Display model info - console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]") - console.print(f" Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings") - console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters") - console.print(f" Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)") - - if args.test_only: - console.print("\n[bold yellow]πŸ§ͺ ARCHITECTURE TEST MODE[/bold yellow]") - # Test with minimal data - test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32)) - test_output = model(test_input) - console.print(f"[green]βœ… Forward pass successful! Output shape: {test_output.data.shape}[/green]") - console.print(f"[green]βœ… YOUR Transformer + DataLoader work together![/green]") - return - - # Step 4: Train using YOUR DataLoader - start_time = time.time() - model = train_tinystories_gpt(model, train_loader, dataset, epochs=args.epochs) - train_time = time.time() - start_time - - # Step 5: Generate text! 
- generated = generate_text(model, dataset, prompt="Once upon a time", max_length=200) - - # Additional generation examples - console.print("\n[bold]🎭 More Generation Examples:[/bold]") - console.print("─" * 70) - - prompts = ["ROMEO:", "The king", "What is"] - for prompt in prompts: - if all(ch in dataset.char_to_idx for ch in prompt): - console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]") - gen = generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8) - - # Step 6: Systems Analysis - analyze_transformer_systems(model) - - console.print(f"\n[bold]⏱️ Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds") - console.print(f" Sequences/sec: [cyan]{len(dataset) * args.epochs / train_time:.0f}[/cyan]") - - console.print("") - console.print(Panel.fit( - "[bold green]βœ… SUCCESS! Shakespeare Transformer Milestone Complete![/bold green]\n\n" - - "[bold]πŸŽ“ What YOU Accomplished:[/bold]\n" - " β€’ YOUR attention mechanism processes sequences in parallel\n" - " β€’ YOUR transformer captures long-range text dependencies\n" - " β€’ YOUR DataLoader efficiently batches character sequences\n" - " β€’ YOUR TinyGPT generates coherent text!\n" - " β€’ YOUR complete language modeling system works!\n\n" - - "[bold]πŸš€ Next Steps:[/bold]\n" - " β€’ Continue to Module 14 (KV-Caching) for 3x faster inference\n" - " β€’ YOUR transformer architecture scales to GPT-scale models\n" - " β€’ This is the foundation of ChatGPT, GPT-4, and all modern LLMs!", - - title="🌟 2017 Transformer Revolution Complete", - border_style="green", - box=box.DOUBLE - )) - -if __name__ == "__main__": - main() diff --git a/milestones/05_2017_transformer/tinytalks_chatbot.py b/milestones/05_2017_transformer/tinytalks_chatbot.py deleted file mode 100644 index b88aee1a..00000000 --- a/milestones/05_2017_transformer/tinytalks_chatbot.py +++ /dev/null @@ -1,375 +0,0 @@ -""" -TinyTalks Chatbot - Train a Simple Conversational AI in 10-15 Minutes 
-====================================================================== - -A minimal but functional chatbot trained on simple Q&A pairs. - -Goal: Show that transformers can learn conversational patterns quickly! -""" - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - -import numpy as np -import time -from tinytorch.core.tensor import Tensor -from tinytorch.core.autograd import enable_autograd -from tinytorch.core.optimizers import Adam -from tinytorch.core.losses import CrossEntropyLoss -from tinytorch.models.transformer import GPT -from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats - -enable_autograd() - -# ============================================================================ -# Tokenization -# ============================================================================ - -def create_tokenizer(conversations): - """Create character-level tokenizer with special tokens.""" - # Get all unique characters - all_text = ' '.join([q + ' ' + a for q, a in conversations]) - all_chars = sorted(set(all_text)) - - # Special tokens - special_tokens = { - '': 0, - '': 1, # Start of sequence - '': 2, # Separator between Q and A - '': 3, # End of sequence - } - - # Character mappings - char_to_idx = {**special_tokens} - idx_to_char = {v: k for k, v in special_tokens.items()} - - for idx, char in enumerate(all_chars, start=len(special_tokens)): - char_to_idx[char] = idx - idx_to_char[idx] = char - - return char_to_idx, idx_to_char - - -def encode_conversation(question, answer, char_to_idx, max_len=80): - """ - Encode Q&A pair as: question answer ... - - Example: - Q: "Hi" - A: "Hello" - β†’ [, H, i, , H, e, l, l, o, , , ...] 
- """ - # Build sequence - tokens = [char_to_idx['']] - - # Add question - for c in question: - tokens.append(char_to_idx.get(c, 0)) - - # Add separator - tokens.append(char_to_idx['']) - - # Add answer - for c in answer: - tokens.append(char_to_idx.get(c, 0)) - - # Add EOS - tokens.append(char_to_idx['']) - - # Pad - if len(tokens) < max_len: - tokens = tokens + [char_to_idx['']] * (max_len - len(tokens)) - else: - tokens = tokens[:max_len] - - return tokens - - -def decode_tokens(tokens, idx_to_char, stop_at_eos=True): - """Decode tokens to string.""" - chars = [] - for t in tokens: - if t == 0: # PAD - if stop_at_eos: - break - elif t == 1: # SOS - continue - elif t == 2: # SEP - chars.append(' | ') - elif t == 3: # EOS - if stop_at_eos: - break - else: - chars.append(idx_to_char.get(t, '?')) - return ''.join(chars) - - -# ============================================================================ -# Training -# ============================================================================ - -def train_chatbot(model, optimizer, loss_fn, train_data, max_time_minutes=10): - """ - Train TinyTalks chatbot. 
- """ - max_time_seconds = max_time_minutes * 60 - - print("=" * 70) - print(f"TRAINING TINYTALKS CHATBOT FOR {max_time_minutes} MINUTES") - print("=" * 70) - print(f"Dataset: {len(train_data)} conversations") - print(f"Time limit: {max_time_seconds}s ({max_time_minutes} minutes)") - print() - - start_time = time.time() - losses = [] - step = 0 - - # Progress checkpoints every 2 minutes - checkpoint_interval = 120 # 2 minutes - next_checkpoint = checkpoint_interval - - print("Training started...") - print() - - while True: - elapsed = time.time() - start_time - if elapsed >= max_time_seconds: - break - - # Sample random conversation - tokens = train_data[np.random.randint(len(train_data))] - - # Next token prediction - input_seq = tokens[:-1] - target_seq = tokens[1:] - - x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False) - y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False) - - # Forward - logits = model.forward(x) - - # Loss - batch_size, seq_len, vocab_size = logits.shape - logits_flat = logits.reshape(batch_size * seq_len, vocab_size) - targets_flat = y_true.reshape(batch_size * seq_len) - loss = loss_fn.forward(logits_flat, targets_flat) - - # Backward - optimizer.zero_grad() - loss.backward() - - # Clip gradients - for param in model.parameters(): - if param.grad is not None: - np.clip(param.grad, -1.0, 1.0, out=param.grad) - - # Update - optimizer.step() - - losses.append(loss.data.item()) - step += 1 - - # Show progress at checkpoints - if elapsed >= next_checkpoint: - avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - steps_per_sec = step / elapsed - mins = int(elapsed / 60) - print(f"[{mins:2d} min] Step {step:5d} | Loss: {avg_loss:.4f} | Speed: {steps_per_sec:.1f} steps/sec") - next_checkpoint += checkpoint_interval - - # Also show every 500 steps for early progress - if step % 500 == 0 and step <= 2000: - avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) 
- print(f"[{int(elapsed):4d}s] Step {step:5d} | Loss: {avg_loss:.4f}") - - final_elapsed = time.time() - start_time - final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - initial_loss = np.mean(losses[:10]) - improvement = (1 - final_loss / initial_loss) * 100 - - print() - print("=" * 70) - print("TRAINING COMPLETE") - print("=" * 70) - print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.1f} minutes)") - print(f"Total steps: {step:,}") - print(f"Steps/second: {step/final_elapsed:.1f}") - print(f"Initial loss: {initial_loss:.4f}") - print(f"Final loss: {final_loss:.4f}") - print(f"Improvement: {improvement:.1f}%") - print() - - return losses, step - - -# ============================================================================ -# Generation / Chat -# ============================================================================ - -def generate_response(model, question, char_to_idx, idx_to_char, max_len=50): - """ - Generate response to a question. - - Process: - 1. Encode: question - 2. Generate tokens until or max_len - 3. 
Decode generated tokens - """ - # Encode question - tokens = [char_to_idx['']] - for c in question: - tokens.append(char_to_idx.get(c, 0)) - tokens.append(char_to_idx['']) - - # Generate response - generated_tokens = [] - for _ in range(max_len): - # Pad input to model's expected length - input_tokens = tokens + generated_tokens - while len(input_tokens) < 80: # Match training max_len - input_tokens.append(char_to_idx['']) - input_tokens = input_tokens[:80] - - # Forward pass - x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False) - logits = model.forward(x) - - # Get next token (position after current sequence) - next_pos = len(tokens) + len(generated_tokens) - 1 - if next_pos < logits.shape[1]: - next_logits = logits.data[0, next_pos, :] - next_token = int(np.argmax(next_logits)) - - # Stop at EOS or PAD - if next_token == char_to_idx[''] or next_token == char_to_idx['']: - break - - generated_tokens.append(next_token) - else: - break - - # Decode generated response - response = decode_tokens(generated_tokens, idx_to_char, stop_at_eos=False) - return response - - -def test_chatbot(model, test_questions, char_to_idx, idx_to_char): - """Test chatbot on sample questions.""" - print("=" * 70) - print("TESTING CHATBOT") - print("=" * 70) - print() - - for question in test_questions: - response = generate_response(model, question, char_to_idx, idx_to_char) - print(f"Q: {question}") - print(f"A: {response}") - print() - - -# ============================================================================ -# Main -# ============================================================================ - -def main(): - print() - print("=" * 70) - print("TINYTALKS CHATBOT - 10-15 MINUTE TRAINING") - print("=" * 70) - print() - - # Load dataset - conversations = create_tinytalks_dataset() - stats = get_dataset_stats() - - print(f"Dataset: {stats['total_examples']} examples ({stats['unique_examples']} unique)") - print(f"Repetition: {stats['repetition_factor']:.1f}x 
for better learning") - print(f"Avg lengths: Q={stats['avg_question_len']:.1f} chars, A={stats['avg_answer_len']:.1f} chars") - print() - - # Create tokenizer - char_to_idx, idx_to_char = create_tokenizer(conversations) - vocab_size = len(idx_to_char) - print(f"Vocabulary: {vocab_size} tokens (including special tokens)") - print() - - # Encode dataset - max_seq_len = 80 - train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations] - - # Model: Ultra-tiny for speed (learned from 5-min test!) - # Target: ~20-30 steps/sec with longer sequences - # In 10 mins (600s): ~12,000-18,000 steps - config = { - 'vocab_size': vocab_size, - 'embed_dim': 16, # Keep it tiny! - 'num_layers': 1, # Just 1 layer - 'num_heads': 2, # 2 heads - 'max_seq_len': max_seq_len, - } - - print("Model configuration:") - for key, val in config.items(): - print(f" {key}: {val}") - print() - - model = GPT(**config) - num_params = sum(np.prod(p.shape) for p in model.parameters()) - print(f"Parameters: {num_params:,}") - print() - - # Optimizer - optimizer = Adam(model.parameters(), lr=0.001) - loss_fn = CrossEntropyLoss() - - # Train for 15 minutes (adjustable) - train_time = 15 # minutes - print(f"Training for {train_time} minutes...") - print() - - losses, total_steps = train_chatbot( - model=model, - optimizer=optimizer, - loss_fn=loss_fn, - train_data=train_data, - max_time_minutes=train_time - ) - - # Test with sample questions - test_questions = [ - "Hi", - "How are you", - "What is your name", - "What is the sky", - "Is grass green", - "What is 1 plus 1", - "Are you happy", - "Bye", - ] - - print("Testing chatbot responses...") - print() - test_chatbot(model, test_questions, char_to_idx, idx_to_char) - - # Summary - print("=" * 70) - print("TINYTALKS SUMMARY") - print("=" * 70) - print(f"βœ“ Model: {num_params:,} parameters") - print(f"βœ“ Training: {train_time} minutes, {total_steps:,} steps") - print(f"βœ“ Loss: {np.mean(losses[:10]):.4f} β†’ 
{np.mean(losses[-100:]):.4f}") - print(f"βœ“ Improvement: {(1 - np.mean(losses[-100:])/np.mean(losses[:10]))*100:.1f}%") - print() - print("Try it yourself:") - print(" 1. Ask simple questions from the training set") - print(" 2. The model should generate learned responses") - print(" 3. Experiment with model size and training time!") - print() - - -if __name__ == "__main__": - main() - diff --git a/milestones/05_2017_transformer/tinytalks_dashboard.py b/milestones/05_2017_transformer/tinytalks_dashboard.py deleted file mode 100644 index 727d0183..00000000 --- a/milestones/05_2017_transformer/tinytalks_dashboard.py +++ /dev/null @@ -1,546 +0,0 @@ -""" -TinyTalks Interactive Dashboard - Watch Learning Happen Live! -============================================================= - -A beautiful, educational dashboard showing a transformer learn to chat. - -Students see: -- Live training metrics -- Responses improving from gibberish to coherent -- Real-time checkpoints with before/after comparison -- Visual feedback on what's correct vs incorrect -""" - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - -import numpy as np -import time -from tinytorch.core.tensor import Tensor -from tinytorch.core.autograd import enable_autograd -from tinytorch.core.optimizers import Adam -from tinytorch.core.losses import CrossEntropyLoss -from tinytorch.models.transformer import GPT -from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats - -enable_autograd() - -# Rich CLI imports -from rich.console import Console -from rich.panel import Panel -from rich.table import Table -from rich.layout import Layout -from rich.live import Live -from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn -from rich import box -from rich.text import Text - -console = Console() - -# ============================================================================ -# Tokenization (same as 
tinytalks_chatbot.py) -# ============================================================================ - -def create_tokenizer(conversations): - """Create character-level tokenizer with special tokens.""" - all_text = ' '.join([q + ' ' + a for q, a in conversations]) - all_chars = sorted(set(all_text)) - - special_tokens = { - '': 0, - '': 1, - '': 2, - '': 3, - } - - char_to_idx = {**special_tokens} - idx_to_char = {v: k for k, v in special_tokens.items()} - - for idx, char in enumerate(all_chars, start=len(special_tokens)): - char_to_idx[char] = idx - idx_to_char[idx] = char - - return char_to_idx, idx_to_char - - -def encode_conversation(question, answer, char_to_idx, max_len=80): - """Encode Q&A pair as: question answer ...""" - tokens = [char_to_idx['']] - - for c in question: - tokens.append(char_to_idx.get(c, 0)) - - tokens.append(char_to_idx['']) - - for c in answer: - tokens.append(char_to_idx.get(c, 0)) - - tokens.append(char_to_idx['']) - - if len(tokens) < max_len: - tokens = tokens + [char_to_idx['']] * (max_len - len(tokens)) - else: - tokens = tokens[:max_len] - - return tokens - - -def decode_tokens(tokens, idx_to_char): - """Decode tokens to string.""" - chars = [] - for t in tokens: - if t == 0 or t == 1: # PAD or SOS - continue - elif t == 2: # SEP - continue - elif t == 3: # EOS - break - else: - chars.append(idx_to_char.get(t, '?')) - return ''.join(chars) - - -def generate_response(model, question, char_to_idx, idx_to_char, max_len=50): - """Generate response to a question.""" - tokens = [char_to_idx['']] - for c in question: - tokens.append(char_to_idx.get(c, 0)) - tokens.append(char_to_idx['']) - - generated_tokens = [] - for _ in range(max_len): - input_tokens = tokens + generated_tokens - while len(input_tokens) < 80: - input_tokens.append(char_to_idx['']) - input_tokens = input_tokens[:80] - - x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False) - logits = model.forward(x) - - next_pos = len(tokens) + 
len(generated_tokens) - 1 - if next_pos < logits.shape[1]: - next_logits = logits.data[0, next_pos, :] - next_token = int(np.argmax(next_logits)) - - if next_token == char_to_idx[''] or next_token == char_to_idx['']: - break - - generated_tokens.append(next_token) - else: - break - - response = decode_tokens(generated_tokens, idx_to_char) - return response - - -# ============================================================================ -# Dashboard Components -# ============================================================================ - -def create_welcome_panel(): - """Create the welcome panel.""" - return Panel.fit( - "[bold cyan]πŸ€– TINYTALKS - Watch a Transformer Learn to Chat![/bold cyan]\n\n" - "[dim]You're about to see AI learning happen in real-time.\n" - "The model starts knowing nothing - just random noise.\n" - "Every training step makes it slightly smarter.\n" - "Watch responses improve from gibberish to coherent conversation![/dim]\n\n" - "[bold]Training Duration:[/bold] 10-15 minutes\n" - "[bold]Checkpoints:[/bold] Every ~2 minutes\n" - "[bold]What to watch:[/bold] Loss ↓ = Better responses βœ“", - title="πŸŽ“ Educational AI Training Demo", - border_style="cyan", - box=box.DOUBLE - ) - - -def create_metrics_table(step, loss, elapsed, steps_per_sec): - """Create current training metrics table.""" - table = Table(show_header=False, box=box.SIMPLE, padding=(0, 2)) - table.add_column("Metric", style="cyan") - table.add_column("Value", style="green bold") - - table.add_row("Step", f"{step:,}") - table.add_row("Loss", f"{loss:.4f}") - table.add_row("Time", f"{int(elapsed/60)}m {int(elapsed%60)}s") - table.add_row("Speed", f"{steps_per_sec:.1f} steps/sec") - - return table - - -def create_checkpoint_comparison(checkpoint_num, step, loss, test_results, expected_answers): - """Create a checkpoint panel showing test results.""" - - # Count correct - correct = 0 - for (q, actual), expected in zip(test_results, expected_answers): - if 
actual.strip().lower() == expected.strip().lower(): - correct += 1 - - accuracy = (correct / len(test_results)) * 100 - - # Create results table - table = Table( - title=f"Checkpoint {checkpoint_num} - Step {step:,} | Loss: {loss:.4f} | Accuracy: {accuracy:.0f}%", - box=box.ROUNDED, - show_header=True - ) - table.add_column("Question", style="cyan", width=22) - table.add_column("Model Response", style="white", width=28) - table.add_column("Status", justify="center", width=8) - - for (question, actual), expected in zip(test_results, expected_answers): - # Determine if correct - is_correct = actual.strip().lower() == expected.strip().lower() - is_close = expected.strip().lower() in actual.strip().lower() or actual.strip().lower() in expected.strip().lower() - - # Color code and emoji - if is_correct: - status = "[green]βœ“ Perfect[/green]" - response_style = "green" - elif is_close: - status = "[yellow]β‰ˆ Close[/yellow]" - response_style = "yellow" - elif len(actual.strip()) > 0: - status = "[red]βœ— Wrong[/red]" - response_style = "red" - else: - status = "[dim]- Empty[/dim]" - response_style = "dim" - - # Truncate long responses - display_response = actual[:26] + "..." 
if len(actual) > 26 else actual - - table.add_row( - question, - f"[{response_style}]{display_response}[/{response_style}]", - status - ) - - return table - - -def create_progress_panel(step, total_steps, checkpoint_num, total_checkpoints): - """Create progress indicators panel.""" - step_progress = (step / total_steps) * 100 if total_steps > 0 else 0 - checkpoint_progress = (checkpoint_num / total_checkpoints) * 100 if total_checkpoints > 0 else 0 - - # Progress bars (ASCII style) - step_bar_filled = int(step_progress / 2.5) # 40 chars max - step_bar = "[" + "=" * step_bar_filled + " " * (40 - step_bar_filled) + "]" - - checkpoint_bar_filled = int(checkpoint_progress / 2.5) - checkpoint_bar = "[" + "=" * checkpoint_bar_filled + " " * (40 - checkpoint_bar_filled) + "]" - - text = ( - f"[bold]Training Progress:[/bold]\n" - f"{step_bar} {step_progress:.1f}% ({step}/{total_steps} steps)\n\n" - f"[bold]Checkpoints:[/bold]\n" - f"{checkpoint_bar} {checkpoint_progress:.1f}% ({checkpoint_num}/{total_checkpoints} completed)" - ) - - return Panel(text, title="πŸ“Š Progress", border_style="blue") - - -# ============================================================================ -# Training with Dashboard -# ============================================================================ - -def train_with_dashboard(model, optimizer, loss_fn, train_data, test_questions, expected_answers, - char_to_idx, idx_to_char, max_time_minutes=10, checkpoint_interval_steps=1500): - """ - Train with beautiful dashboard showing live progress. 
- """ - max_time_seconds = max_time_minutes * 60 - - console.clear() - console.print(create_welcome_panel()) - console.print() - - input("[bold cyan]Press ENTER to start training...[/bold cyan]") - console.clear() - - # Training setup - start_time = time.time() - losses = [] - step = 0 - checkpoint_num = 0 - - # Calculate expected checkpoints - estimated_total_steps = int(max_time_seconds * 12) # ~12 steps/sec - total_checkpoints = estimated_total_steps // checkpoint_interval_steps - - # Initial evaluation - console.print("\n[bold]πŸ“Š CHECKPOINT 0: Initial Model (Untrained)[/bold]\n") - initial_results = [(q, generate_response(model, q, char_to_idx, idx_to_char)) for q in test_questions] - console.print(create_checkpoint_comparison(0, 0, 999.9, initial_results, expected_answers)) - console.print() - - console.print("[dim]Starting training... Watch the responses improve![/dim]\n") - time.sleep(2) - - next_checkpoint = checkpoint_interval_steps - last_print_time = time.time() - - # Training loop - while True: - elapsed = time.time() - start_time - if elapsed >= max_time_seconds: - break - - # Training step - tokens = train_data[np.random.randint(len(train_data))] - input_seq = tokens[:-1] - target_seq = tokens[1:] - - x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False) - y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False) - - logits = model.forward(x) - - batch_size, seq_len, vocab_size = logits.shape - logits_flat = logits.reshape(batch_size * seq_len, vocab_size) - targets_flat = y_true.reshape(batch_size * seq_len) - loss = loss_fn.forward(logits_flat, targets_flat) - - optimizer.zero_grad() - loss.backward() - - for param in model.parameters(): - if param.grad is not None: - np.clip(param.grad, -1.0, 1.0, out=param.grad) - - optimizer.step() - - losses.append(loss.data.item()) - step += 1 - - # Print progress every 5 seconds - if time.time() - last_print_time >= 5.0: - avg_loss = np.mean(losses[-100:]) if len(losses) 
>= 100 else np.mean(losses) - steps_per_sec = step / elapsed - console.print( - f"[dim]Step {step:5d} | " - f"Loss: {avg_loss:.4f} | " - f"Time: {int(elapsed/60)}m{int(elapsed%60):02d}s | " - f"Speed: {steps_per_sec:.1f} steps/sec[/dim]" - ) - last_print_time = time.time() - - # Checkpoint evaluation - if step >= next_checkpoint: - checkpoint_num += 1 - avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - - console.print("\n" + "="*70) - console.print(f"[bold yellow]⏸️ CHECKPOINT {checkpoint_num}[/bold yellow]") - console.print(f"[dim]Pausing training to evaluate... (Step {step:,})[/dim]\n") - - # Evaluate - current_results = [(q, generate_response(model, q, char_to_idx, idx_to_char)) for q in test_questions] - - # Show results - console.print(create_checkpoint_comparison(checkpoint_num, step, avg_loss, current_results, expected_answers)) - console.print() - - # Show progress - console.print(create_progress_panel(step, estimated_total_steps, checkpoint_num, total_checkpoints)) - console.print() - - console.print("[dim]Continuing training...[/dim]\n") - next_checkpoint += checkpoint_interval_steps - time.sleep(1) - - # Final results - final_elapsed = time.time() - start_time - final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - initial_loss = np.mean(losses[:10]) - improvement = (1 - final_loss / initial_loss) * 100 - - console.print("\n" + "="*70) - console.print("[bold green]πŸŽ‰ TRAINING COMPLETE![/bold green]\n") - - # Final evaluation - final_results = [(q, generate_response(model, q, char_to_idx, idx_to_char)) for q in test_questions] - console.print(create_checkpoint_comparison("FINAL", step, final_loss, final_results, expected_answers)) - console.print() - - # Summary table - summary = Table(title="Training Summary", box=box.DOUBLE, show_header=True) - summary.add_column("Metric", style="cyan", width=30) - summary.add_column("Value", style="green bold", width=30) - - summary.add_row("Total Training 
Time", f"{final_elapsed/60:.1f} minutes") - summary.add_row("Total Steps", f"{step:,}") - summary.add_row("Steps/Second", f"{step/final_elapsed:.1f}") - summary.add_row("Initial Loss", f"{initial_loss:.4f}") - summary.add_row("Final Loss", f"{final_loss:.4f}") - summary.add_row("Improvement", f"{improvement:.1f}%") - summary.add_row("Checkpoints Evaluated", f"{checkpoint_num}") - - console.print(summary) - console.print() - - # Count perfect responses for milestone card - correct = sum(1 for (q, actual), expected in zip(final_results, expected_answers) - if actual.strip().lower() == expected.strip().lower()) - accuracy = (correct / len(test_questions)) * 100 - - return losses, step, accuracy - - -# ============================================================================ -# Main -# ============================================================================ - -def main(): - # Dataset - conversations = create_tinytalks_dataset() - char_to_idx, idx_to_char = create_tokenizer(conversations) - vocab_size = len(idx_to_char) - - # Encode - max_seq_len = 80 - train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations] - - # Test questions and expected answers - test_questions = [ - "Hi", - "How are you", - "What is your name", - "What is the sky", - "Is grass green", - "What is 1 plus 1", - "Are you happy" - ] - - expected_answers = [ - "Hello! 
How can I help you?", - "I am doing well, thanks!", - "I am TinyBot", - "The sky is blue", - "Yes, grass is green", - "1 plus 1 equals 2", - "Yes, I am happy" - ] - - # Model - config = { - 'vocab_size': vocab_size, - 'embed_dim': 16, - 'num_layers': 1, - 'num_heads': 2, - 'max_seq_len': max_seq_len, - } - - model = GPT(**config) - num_params = sum(np.prod(p.shape) for p in model.parameters()) - - # Optimizer - optimizer = Adam(model.parameters(), lr=0.001) - loss_fn = CrossEntropyLoss() - - # Train with dashboard - train_time = 15 # 15 minutes for better results - checkpoint_interval = 2000 # Every ~2.5 minutes - - console.print(Panel.fit( - f"[bold]Model:[/bold] {num_params:,} parameters (ultra-tiny!)\n" - f"[bold]Training Time:[/bold] {train_time} minutes\n" - f"[bold]Checkpoints:[/bold] Every {checkpoint_interval} steps (~2 min)\n" - f"[bold]Test Questions:[/bold] {len(test_questions)} questions\n\n" - f"[dim]Watch loss decrease and responses improve![/dim]", - title="βš™οΈ Configuration", - border_style="blue" - )) - - losses, total_steps, final_accuracy = train_with_dashboard( - model=model, - optimizer=optimizer, - loss_fn=loss_fn, - train_data=train_data, - test_questions=test_questions, - expected_answers=expected_answers, - char_to_idx=char_to_idx, - idx_to_char=idx_to_char, - max_time_minutes=train_time, - checkpoint_interval_steps=checkpoint_interval - ) - - # Calculate metrics for milestone card - loss_improvement = (1 - np.mean(losses[-100:]) / np.mean(losses[:10])) * 100 - - # Milestone completion card - console.print() - if final_accuracy >= 50 and loss_improvement >= 80: - console.print(Panel.fit( - "[bold green]πŸŽ‰ Congratulations! 
You've Built a Working Chatbot![/bold green]\n\n" - - f"Final accuracy: [bold]{final_accuracy:.0f}%[/bold] | " - f"Loss improved: [bold]{loss_improvement:.1f}%[/bold]\n\n" - - "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" - - "[bold]πŸ’‘ What YOU Just Accomplished:[/bold]\n" - " βœ“ Built a TRANSFORMER (2017 Vaswani et al)\n" - " βœ“ Trained with attention mechanism from scratch\n" - " βœ“ Watched AI learn language patterns in real-time\n" - " βœ“ Demonstrated gradient descent on complex architectures\n" - f" βœ“ Trained {total_steps:,} steps in {train_time} minutes!\n\n" - - "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" - - "[bold]πŸŽ“ Why This Matters:[/bold]\n" - " This is the SAME architecture behind ChatGPT, GPT-4, and BERT.\n" - " You just witnessed the magic of:\n" - " β€’ Self-attention (learning relationships between words)\n" - " β€’ Position encoding (understanding word order)\n" - " β€’ Autoregressive generation (predicting next token)\n\n" - - "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" - - "[bold]πŸ“Œ The Key Insight:[/bold]\n" - " You saw responses evolve from gibberish to coherent:\n" - " Checkpoint 0: Random noise\n" - " Checkpoint 1: Recognizable words\n" - " Checkpoint 2: Partial sentences\n" - " Final: Perfect responses!\n" - " \n" - " [yellow]Scale it up:[/yellow] Same process, more data, more params β†’\n" - " You get GPT-4 (175B params, trained for weeks)!\n\n" - - "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" - - "[bold]πŸš€ What You Can Do Now:[/bold]\n" - "β€’ Experiment with different architectures (layers, heads)\n" - "β€’ Try longer training (15-20 minutes for better results)\n" - "β€’ Add more conversation patterns to the dataset\n" - "β€’ Scale up the model (more parameters = better learning)\n\n" - - "[bold cyan]You've mastered the foundation of modern AI! 
🌟[/bold cyan]", - - title="🌟 2017 Transformer Complete - Milestone 05", - border_style="green", - box=box.DOUBLE - )) - else: - console.print(Panel.fit( - "[bold yellow]⚠️ Training Complete - Needs More Time[/bold yellow]\n\n" - f"Current accuracy: {final_accuracy:.0f}% | Loss improved: {loss_improvement:.1f}%\n\n" - "Your transformer is learning but needs more training time.\n\n" - "[bold]What to try:[/bold]\n" - "β€’ Train for 15-20 minutes instead of 10\n" - "β€’ Use a slightly bigger model (2 layers, 24 dims)\n" - "β€’ Add more data repetition for reinforcement\n\n" - "[dim]The attention mechanism is working - it just needs more steps to converge!\n" - "Even partial success shows the transformer learned patterns.[/dim]", - title="πŸ”„ Learning in Progress", - border_style="yellow", - box=box.DOUBLE - )) - - -if __name__ == "__main__": - main() - diff --git a/milestones/05_2017_transformer/tinytalks_dataset.py b/milestones/05_2017_transformer/tinytalks_dataset.py deleted file mode 100644 index 50122fe6..00000000 --- a/milestones/05_2017_transformer/tinytalks_dataset.py +++ /dev/null @@ -1,208 +0,0 @@ -""" -TinyTalks Dataset - Small Conversational Dataset for Transformer Training -========================================================================== - -A carefully curated micro-dataset for training a chatbot in 10-15 minutes. - -Design Principles: -- Simple Q&A format -- Short responses (< 50 characters) -- Common patterns repeated for learning -- Character-level tokenization friendly -- Covers basic conversational topics -""" - -def create_tinytalks_dataset(): - """ - Create a small but diverse conversational dataset. - - Categories: - 1. Greetings & Politeness - 2. Simple Facts - 3. Yes/No Questions - 4. Weather & Time - 5. Feelings & States - 6. Simple Math - 7. Colors & Objects - 8. Names & Identity - """ - - conversations = [ - # === GREETINGS (repeated 5x for strong learning) === - ("Hi", "Hello! How can I help you?"), - ("Hello", "Hi there! 
How are you?"), - ("Good morning", "Good morning! Nice day!"), - ("How are you", "I am doing well, thanks!"), - ("Bye", "Goodbye! Have a great day!"), - - # Repeat for learning - ("Hi", "Hello! How can I help you?"), - ("Hello", "Hi there! How are you?"), - ("How are you", "I am doing well, thanks!"), - ("Bye", "Goodbye! Have a great day!"), - - ("Hi", "Hello! How can I help you?"), - ("Hello", "Hi there! How are you?"), - ("Bye", "Goodbye! Have a great day!"), - - # === SIMPLE FACTS (repeated 3x) === - ("What is the sky", "The sky is blue"), - ("What is grass", "Grass is green"), - ("What is the sun", "The sun is bright and yellow"), - ("What is water", "Water is wet and clear"), - ("What is fire", "Fire is hot and red"), - - ("What is the sky", "The sky is blue"), - ("What is grass", "Grass is green"), - ("What is water", "Water is wet and clear"), - - ("What is the sky", "The sky is blue"), - ("What is grass", "Grass is green"), - - # === YES/NO QUESTIONS (repeated 3x) === - ("Is the sky blue", "Yes, the sky is blue"), - ("Is grass green", "Yes, grass is green"), - ("Is the sun cold", "No, the sun is hot"), - ("Is water dry", "No, water is wet"), - ("Is fire safe", "No, fire is dangerous"), - - ("Is the sky blue", "Yes, the sky is blue"), - ("Is grass green", "Yes, grass is green"), - ("Is the sun cold", "No, the sun is hot"), - - ("Is the sky blue", "Yes, the sky is blue"), - ("Is grass green", "Yes, grass is green"), - - # === WEATHER (repeated 3x) === - ("Is it sunny", "Yes, it is sunny today"), - ("Is it raining", "No, it is not raining"), - ("Is it cold", "No, it is warm today"), - ("What is the weather", "The weather is nice"), - - ("Is it sunny", "Yes, it is sunny today"), - ("What is the weather", "The weather is nice"), - - ("Is it sunny", "Yes, it is sunny today"), - - # === FEELINGS (repeated 3x) === - ("Are you happy", "Yes, I am happy"), - ("Are you sad", "No, I am not sad"), - ("Are you tired", "No, I feel good"), - ("Do you like learning", "Yes, I 
love learning"), - - ("Are you happy", "Yes, I am happy"), - ("Do you like learning", "Yes, I love learning"), - - ("Are you happy", "Yes, I am happy"), - - # === SIMPLE MATH (repeated 3x) === - ("What is 1 plus 1", "1 plus 1 equals 2"), - ("What is 2 plus 2", "2 plus 2 equals 4"), - ("What is 3 plus 3", "3 plus 3 equals 6"), - ("What is 5 plus 5", "5 plus 5 equals 10"), - - ("What is 1 plus 1", "1 plus 1 equals 2"), - ("What is 2 plus 2", "2 plus 2 equals 4"), - - ("What is 1 plus 1", "1 plus 1 equals 2"), - - # === COLORS (repeated 3x) === - ("What color is the sky", "The sky is blue"), - ("What color is grass", "Grass is green"), - ("What color is the sun", "The sun is yellow"), - ("What color is snow", "Snow is white"), - - ("What color is the sky", "The sky is blue"), - ("What color is grass", "Grass is green"), - - ("What color is the sky", "The sky is blue"), - - # === IDENTITY (repeated 3x) === - ("What is your name", "I am TinyBot"), - ("Who are you", "I am TinyBot, your helper"), - ("What do you do", "I help answer questions"), - - ("What is your name", "I am TinyBot"), - ("Who are you", "I am TinyBot, your helper"), - - ("What is your name", "I am TinyBot"), - - # === CAPABILITIES (repeated 2x) === - ("Can you help me", "Yes, I can help you"), - ("Can you talk", "Yes, I can talk with you"), - ("Do you understand", "Yes, I understand you"), - - ("Can you help me", "Yes, I can help you"), - ("Can you talk", "Yes, I can talk with you"), - ] - - return conversations - - -def get_dataset_stats(): - """Get statistics about the dataset.""" - conversations = create_tinytalks_dataset() - - unique_conversations = set(conversations) - total_chars = sum(len(q) + len(a) for q, a in conversations) - avg_question_len = sum(len(q) for q, _ in conversations) / len(conversations) - avg_answer_len = sum(len(a) for _, a in conversations) / len(conversations) - - return { - 'total_examples': len(conversations), - 'unique_examples': len(unique_conversations), - 
'repetition_factor': len(conversations) / len(unique_conversations), - 'total_chars': total_chars, - 'avg_question_len': avg_question_len, - 'avg_answer_len': avg_answer_len, - 'categories': [ - 'Greetings (5x repeat)', - 'Simple Facts (3x repeat)', - 'Yes/No Questions (3x repeat)', - 'Weather (3x repeat)', - 'Feelings (3x repeat)', - 'Simple Math (3x repeat)', - 'Colors (3x repeat)', - 'Identity (3x repeat)', - 'Capabilities (2x repeat)' - ] - } - - -def print_dataset_info(): - """Print dataset information.""" - conversations = create_tinytalks_dataset() - stats = get_dataset_stats() - - print("=" * 70) - print("TINYTALKS DATASET") - print("=" * 70) - print() - print(f"Total examples: {stats['total_examples']}") - print(f"Unique examples: {stats['unique_examples']}") - print(f"Repetition factor: {stats['repetition_factor']:.1f}x") - print(f"Average question length: {stats['avg_question_len']:.1f} chars") - print(f"Average answer length: {stats['avg_answer_len']:.1f} chars") - print() - print("Categories:") - for cat in stats['categories']: - print(f" β€’ {cat}") - print() - print("Sample conversations:") - print("-" * 70) - - # Show 10 random unique examples - unique = list(set(conversations)) - import random - random.seed(42) - samples = random.sample(unique, min(10, len(unique))) - - for q, a in samples: - print(f"Q: {q}") - print(f"A: {a}") - print() - - -if __name__ == "__main__": - print_dataset_info() - diff --git a/milestones/05_2017_transformer/tinytalks_gpt.py b/milestones/05_2017_transformer/tinytalks_gpt.py deleted file mode 100755 index 03153f80..00000000 --- a/milestones/05_2017_transformer/tinytalks_gpt.py +++ /dev/null @@ -1,746 +0,0 @@ -#!/usr/bin/env python3 -""" -TinyTalks Q&A Generation (2017) - Transformer Era -================================================== - -πŸ“š HISTORICAL CONTEXT: -In 2017, Vaswani et al. published "Attention Is All You Need", showing that -attention mechanisms alone (no RNNs!) 
could achieve state-of-the-art results -on sequence tasks. This breakthrough launched the era of GPT, BERT, and modern LLMs. - -🎯 WHAT YOU'RE BUILDING: -Using YOUR TinyTorch implementations, you'll build a character-level conversational -model that learns to answer questions - proving YOUR attention mechanism works! - -TinyTalks is PERFECT for learning: -- Small dataset (17.5 KB) = 3-5 minute training! -- Clear Q&A format (easy to verify learning) -- Progressive difficulty (5 levels) -- Instant gratification: Watch your transformer learn to chat! - -βœ… REQUIRED MODULES (Run after Module 13): -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - Module 01 (Tensor) : YOUR data structure with autograd - Module 02 (Activations) : YOUR ReLU and GELU activations - Module 03 (Layers) : YOUR Linear layers - Module 04 (Losses) : YOUR CrossEntropyLoss - Module 05 (Autograd) : YOUR automatic differentiation - Module 06 (Optimizers) : YOUR Adam optimizer - Module 08 (DataLoader) : YOUR data batching - Module 10 (Tokenization) : YOUR CharTokenizer for textβ†’numbers - Module 11 (Embeddings) : YOUR token & positional embeddings - Module 12 (Attention) : YOUR multi-head self-attention - Module 13 (Transformers) : YOUR LayerNorm + TransformerBlock + GPT -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -πŸ—οΈ ARCHITECTURE (Character-Level Q&A Model): - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Output Predictions β”‚ - β”‚ Character Probabilities (vocab_size) β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Output Projection β”‚ - β”‚ Module 03: vectors β†’ vocabulary β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Layer Norm β”‚ - β”‚ Module 13: Final normalization β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - ╔══════════════════════════════════════════════════════════════════════════════╗ - β•‘ Transformer Block Γ— N (Repeat) β•‘ - β•‘ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ - β•‘ β”‚ Feed Forward Network β”‚ β•‘ - β•‘ β”‚ Module 03: Linear β†’ GELU β†’ Linear β”‚ β•‘ - β•‘ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ - β•‘ β–² β•‘ - β•‘ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ - β•‘ β”‚ Multi-Head Self-Attention β”‚ β•‘ - β•‘ β”‚ Module 12: QueryΒ·Key^TΒ·Value across all positions β”‚ β•‘ - β•‘ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ - β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β• - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Positional Encoding β”‚ - β”‚ Module 11: Add position information β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Character Embeddings β”‚ - β”‚ Module 11: chars β†’ embed_dim vectors β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Input Characters β”‚ - β”‚ "Q: What color is the sky? A:" β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - -πŸ“Š EXPECTED PERFORMANCE: -- Dataset: 17.5 KB TinyTalks (301 Q&A pairs, 5 difficulty levels) -- Training time: 3-5 minutes (instant gratification!) -- Vocabulary: ~68 unique characters (simple English Q&A) -- Expected: 70-80% accuracy on Level 1-2 questions after training -- Parameters: ~1.2M (perfect size for fast learning on small data) - -πŸ’‘ WHAT TO WATCH FOR: -- Epoch 1-3: Model learns Q&A structure ("A:" follows "Q:") -- Epoch 4-7: Starts giving sensible (if incorrect) answers -- Epoch 8-12: 50-60% accuracy on simple questions -- Epoch 13-20: 70-80% accuracy, proper grammar -- Success = "Wow, my transformer actually learned to answer questions!" -""" - -import sys -import os -import numpy as np -import argparse -import time -from rich.console import Console -from rich.panel import Panel -from rich.table import Table -from rich import box - -# Add project root to path -project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(project_root) - -console = Console() - - -def print_banner(): - """Print a beautiful banner for the milestone""" - banner_text = """ -╔══════════════════════════════════════════════════════════════════╗ -β•‘ β•‘ -β•‘ πŸ€– TinyTalks Q&A Bot Training (2017) β•‘ -β•‘ Transformer Architecture β•‘ -β•‘ β•‘ -β•‘ "Your first transformer learning to answer questions!" 
def filter_by_levels(text, levels):
    """
    Filter the TinyTalks dataset to only include the requested difficulty levels.

    Levels are marked in the original generation as:
        L1: Greetings (47 pairs)
        L2: Facts (82 pairs)
        L3: Math (45 pairs)
        L4: Reasoning (87 pairs)
        L5: Context (40 pairs)

    The dataset text itself carries no level tag, so each question is
    classified heuristically by keyword; anything that matches no keyword
    is treated as level 5.

    Args:
        text: Full dataset text; Q&A pairs separated by a blank line, each
            pair being a "Q: ..." line followed by an "A: ..." line.
        levels: Iterable of levels (1-5) to keep, or None for everything.

    Returns:
        The original text unchanged when every level is requested (in any
        order), otherwise the matching pairs re-joined with blank lines.
    """
    # Local import: the module header does not import `re`.
    import re

    # Order-insensitive "everything requested" check; the previous list
    # equality only recognised the exact sequence [1, 2, 3, 4, 5].
    if levels is None or {1, 2, 3, 4, 5}.issubset(levels):
        return text  # Use full dataset

    wanted = set(levels)

    # Short greeting words must match as whole words, otherwise 'hi' fires on
    # "this"/"which" and sends factual questions to level 1. Every other
    # keyword matches at a word start so that "where does" / "colors" /
    # "used" are still recognised.
    whole_word_only = {'hello', 'hi', 'hey', 'goodbye', 'bye'}
    keyword_table = (
        (1, ('hello', 'hi', 'hey', 'goodbye', 'bye', 'name', 'who are you', 'what are you')),
        (2, ('color', 'legs', 'days', 'months', 'sound', 'capital')),
        (3, ('plus', 'minus', 'times', 'divided', 'equals')),
        (4, ('use', 'where do', 'what do', 'happens if', 'need to')),
    )
    matchers = []
    for level, keywords in keyword_table:
        alternatives = '|'.join(
            r'\b' + re.escape(word) + (r'\b' if word in whole_word_only else '')
            for word in keywords
        )
        matchers.append((level, re.compile(alternatives)))

    # Parse Q&A pairs
    pairs = []
    for block in text.strip().split('\n\n'):
        lines = block.strip().split('\n')
        if len(lines) != 2:
            continue
        if not (lines[0].startswith('Q:') and lines[1].startswith('A:')):
            continue

        # Strip only the two-character marker; the following space is optional.
        q = lines[0][2:].strip()
        a = lines[1][2:].strip()

        # Classify level (heuristic); first matching level wins, default 5.
        q_lower = q.lower()
        level = next((lvl for lvl, rx in matchers if rx.search(q_lower)), 5)

        if level in wanted:
            pairs.append(f"Q: {q}\nA: {a}")

    filtered_text = '\n\n'.join(pairs)
    console.print(f"[yellow]📊 Filtered to Level(s) {levels}:[/yellow]")
    console.print(f"  Q&A pairs: {len(pairs)}")
    console.print(f"  Characters: {len(filtered_text)}")

    return filtered_text


class TinyTalksDataset:
    """
    Character-level dataset for TinyTalks Q&A.

    Creates sequences of characters for autoregressive language modeling:
    - Input:  "Q: What color is the sky? A: The sk"
    - Target: ": What color is the sky? A: The sky"

    The target is the input shifted by one character, so the model is
    trained to predict the next character at every position.
    """

    def __init__(self, text, seq_length=64, levels=None):
        """
        Args:
            text: Full text string (Q&A pairs)
            seq_length: Length of input sequences
            levels: List of difficulty levels to include (1-5), None = all
        """
        from tinytorch.text.tokenization import CharTokenizer

        self.seq_length = seq_length

        # Filter by levels if specified
        if levels:
            text = filter_by_levels(text, levels)

        # Store original text for testing
        self.text = text

        # Build character vocabulary using CharTokenizer
        self.tokenizer = CharTokenizer()
        self.tokenizer.build_vocab([text])

        # Encode entire text
        self.data = self.tokenizer.encode(text)

        console.print("[green]✓[/green] Dataset initialized:")
        console.print(f"  Total characters: {len(text)}")
        console.print(f"  Vocabulary size: {self.tokenizer.vocab_size}")
        console.print(f"  Sequence length: {seq_length}")
        console.print(f"  Total sequences: {len(self)}")

    def __len__(self):
        """Number of full (input, target) windows; 0 when the text is too short."""
        # Clamp at zero: a negative value makes the built-in len() raise.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """
        Get one training example.

        Args:
            idx: Window start position; negative values count from the end.

        Returns:
            input_seq: Characters [idx : idx+seq_length]
            target_seq: Characters [idx+1 : idx+seq_length+1] (shifted by 1)

        Raises:
            IndexError: If idx is outside [-len(self), len(self)). Raising
                (instead of returning a short slice) keeps batches
                rectangular and lets plain iteration terminate.
        """
        count = len(self)
        if idx < 0:
            idx += count
        if not 0 <= idx < count:
            raise IndexError("TinyTalksDataset index out of range")

        input_seq = self.data[idx:idx + self.seq_length]
        target_seq = self.data[idx + 1:idx + self.seq_length + 1]
        return input_seq, target_seq

    def decode(self, indices):
        """Decode token indices back to text"""
        return self.tokenizer.decode(indices)
Positional encodings (add position information) - 3. N transformer blocks (self-attention + feed-forward) - 4. Output projection (vectors back to character probabilities) - - Built entirely from YOUR TinyTorch modules! - """ - - def __init__(self, vocab_size, embed_dim=128, num_layers=4, num_heads=4, - max_seq_len=64, dropout=0.1): - """ - Args: - vocab_size: Number of unique characters - embed_dim: Dimension of embeddings and hidden states - num_layers: Number of transformer blocks - num_heads: Number of attention heads per block - max_seq_len: Maximum sequence length - dropout: Dropout probability (for training) - """ - from tinytorch.core.tensor import Tensor - from tinytorch.text.embeddings import Embedding, PositionalEncoding - from tinytorch.models.transformer import LayerNorm, TransformerBlock - from tinytorch.core.layers import Linear - - self.vocab_size = vocab_size - self.embed_dim = embed_dim - self.num_layers = num_layers - self.num_heads = num_heads - self.max_seq_len = max_seq_len - - # 1. Token embeddings: char_id β†’ embed_dim vector - self.token_embedding = Embedding(vocab_size, embed_dim) - - # 2. Positional encoding: add position information - self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim) - - # 3. Transformer blocks (stacked) - self.blocks = [] - for _ in range(num_layers): - block = TransformerBlock( - embed_dim=embed_dim, - num_heads=num_heads, - mlp_ratio=4, # FFN hidden_dim = 4 * embed_dim - dropout_prob=dropout - ) - self.blocks.append(block) - - # 4. Final layer normalization - self.ln_f = LayerNorm(embed_dim) - - # 5. 
Output projection: embed_dim β†’ vocab_size - self.output_proj = Linear(embed_dim, vocab_size) - - console.print(f"[green]βœ“[/green] TinyGPT model initialized:") - console.print(f" Vocabulary: {vocab_size}") - console.print(f" Embedding dim: {embed_dim}") - console.print(f" Layers: {num_layers}") - console.print(f" Heads: {num_heads}") - console.print(f" Max sequence: {max_seq_len}") - - # Count parameters - total_params = self.count_parameters() - console.print(f" [bold]Total parameters: {total_params:,}[/bold]") - - def forward(self, x): - """ - Forward pass through the model. - - Args: - x: Input tensor of shape (batch, seq_len) with token indices - - Returns: - logits: Output tensor of shape (batch, seq_len, vocab_size) - """ - from tinytorch.core.tensor import Tensor - - # 1. Token embeddings: (batch, seq_len) β†’ (batch, seq_len, embed_dim) - x = self.token_embedding.forward(x) - - # 2. Add positional encoding - x = self.pos_encoding.forward(x) - - # 3. Pass through transformer blocks - for block in self.blocks: - x = block.forward(x) - - # 4. Final layer norm - x = self.ln_f.forward(x) - - # 5. 
def generate(self, tokenizer, prompt="Q:", max_new_tokens=100, temperature=1.0):
    """
    Generate text autoregressively.

    Args:
        tokenizer: CharTokenizer for encoding/decoding
        prompt: Starting text; must encode to at least one token
        max_new_tokens: Maximum number of characters to generate
        temperature: Sampling temperature (higher = more random). A value
            of 0 or below selects the most likely character (greedy
            decoding) instead of dividing by zero.

    Returns:
        Prompt plus generated text. Generation stops early once the model
        starts the next question ("\\n\\nQ"); that marker is not included
        in the returned string.

    Raises:
        ValueError: If the prompt encodes to no tokens.
    """
    from tinytorch.core.tensor import Tensor

    stop_sequence = "\n\nQ"  # blank line followed by the next question marker

    # Encode prompt; copy into a list so appending never touches tokenizer state
    indices = list(tokenizer.encode(prompt))
    if not indices:
        raise ValueError("prompt must encode to at least one token")
    prompt_length = len(indices)

    # Generate tokens one at a time
    for _ in range(max_new_tokens):
        # Get last max_seq_len tokens (context window)
        context = indices[-self.max_seq_len:]

        # Prepare input: (1, seq_len)
        x_input = Tensor(np.array([context]))

        # Forward pass
        logits = self.forward(x_input)

        # Logits for the last position: (vocab_size,)
        last_logits = logits.data[0, -1, :]

        if temperature <= 0:
            # Greedy decoding: the limit of sampling as temperature -> 0
            next_idx = int(np.argmax(last_logits))
        else:
            scaled = last_logits / temperature
            # Subtract the max before exponentiating for numerical stability
            exp_logits = np.exp(scaled - np.max(scaled))
            probs = exp_logits / np.sum(exp_logits)
            # Sample from distribution; store a plain int, not a numpy scalar
            next_idx = int(np.random.choice(len(probs), p=probs))

        # Append to sequence
        indices.append(next_idx)

        # Stop once the answer has ended and a new question is starting
        if len(indices) >= 3 and tokenizer.decode(indices[-3:]) == stop_sequence:
            # Drop the stop marker, but never cut into the caller's prompt
            del indices[max(prompt_length, len(indices) - 3):]
            break

    return tokenizer.decode(indices)
def train_tinytalks_gpt(model, dataset, optimizer, criterion, epochs=20, batch_size=32,
                        log_interval=50, test_prompts=None, max_batches_per_epoch=500):
    """
    Train the TinyGPT model on TinyTalks dataset.

    Training loop:
    1. Sample random batch of sequences
    2. Forward pass: predict next character for each position
    3. Compute cross-entropy loss
    4. Backward pass: compute gradients
    5. Update parameters with the optimizer
    6. Periodically test on sample questions to show learning

    Args:
        model: TinyGPT instance
        dataset: TinyTalksDataset instance
        optimizer: Adam optimizer
        criterion: CrossEntropyLoss
        epochs: Number of training epochs
        batch_size: Number of sequences per batch
        log_interval: Print loss every N batches (0 logs only the first batch)
        test_prompts: Optional list of questions to test during training
        max_batches_per_epoch: Upper bound on batches sampled per epoch

    Raises:
        ValueError: If the dataset holds no sequences or batch_size is not
            positive. Previously these cases surfaced as an opaque
            ZeroDivisionError or a numpy sampling error mid-training.
    """
    # Validate before touching any framework code so misuse fails fast.
    num_sequences = len(dataset)
    if num_sequences <= 0:
        raise ValueError("dataset contains no training sequences "
                         "(is the text shorter than seq_length?)")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.autograd import enable_autograd

    # Enable autograd
    enable_autograd()

    console.print("\n[bold cyan]Starting Training...[/bold cyan]")
    console.print(f"  Epochs: {epochs}")
    console.print(f"  Batch size: {batch_size}")
    console.print(f"  Dataset size: {num_sequences} sequences")

    # Batches are sampled with replacement, so even a dataset smaller than one
    # batch can train; always run at least one batch per epoch.
    batches_per_epoch = max(1, min(max_batches_per_epoch, num_sequences // batch_size))

    start_time = time.time()

    for epoch in range(epochs):
        epoch_start = time.time()
        epoch_loss = 0.0
        num_batches = 0

        for batch_idx in range(batches_per_epoch):
            # Sample random batch
            batch_indices = np.random.randint(0, num_sequences, size=batch_size)

            batch_inputs = []
            batch_targets = []
            for idx in batch_indices:
                input_seq, target_seq = dataset[int(idx)]
                batch_inputs.append(input_seq)
                batch_targets.append(target_seq)

            # Convert to tensors: (batch, seq_len)
            batch_input = Tensor(np.array(batch_inputs))
            batch_target = Tensor(np.array(batch_targets))

            # Forward pass
            logits = model.forward(batch_input)

            # Reshape for loss computation: (batch, seq, vocab) -> (batch*seq, vocab)
            # IMPORTANT: Use Tensor.reshape() to preserve computation graph!
            batch_size_actual, seq_length, vocab_size = logits.shape
            logits_2d = logits.reshape(batch_size_actual * seq_length, vocab_size)
            targets_1d = batch_target.reshape(-1)

            # Compute loss
            loss = criterion.forward(logits_2d, targets_1d)

            # Backward pass, parameter update, then clear gradients for the next batch
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Track loss
            batch_loss = float(loss.data)
            epoch_loss += batch_loss
            num_batches += 1

            # Log progress (guard against log_interval == 0)
            at_interval = bool(log_interval) and (batch_idx + 1) % log_interval == 0
            if at_interval or batch_idx == 0:
                avg_loss = epoch_loss / num_batches
                elapsed = time.time() - start_time
                console.print(
                    f"  Epoch {epoch+1}/{epochs} | "
                    f"Batch {batch_idx+1}/{batches_per_epoch} | "
                    f"Loss: {batch_loss:.4f} | "
                    f"Avg: {avg_loss:.4f} | "
                    f"Time: {elapsed:.1f}s"
                )

        # Epoch summary (num_batches >= 1 is guaranteed above)
        avg_epoch_loss = epoch_loss / num_batches
        epoch_time = time.time() - epoch_start
        console.print(
            f"[green]✓[/green] Epoch {epoch+1}/{epochs} complete | "
            f"Avg Loss: {avg_epoch_loss:.4f} | "
            f"Time: {epoch_time:.1f}s"
        )

        # Test model every 5 epochs (plus first and last) to show learning progress
        if (epoch + 1) % 5 == 0 or epoch == 0 or epoch == epochs - 1:
            test_model_predictions(model, dataset, test_prompts)

    total_time = time.time() - start_time
    console.print("\n[bold green]✓ Training complete![/bold green]")
    console.print(f"  Total time: {total_time/60:.2f} minutes")
- """ - console.print("\n" + "=" * 70) - console.print("[bold cyan]πŸ€– TinyBot Demo: Ask Me Questions![/bold cyan]") - console.print("=" * 70) - - # Test questions from different levels - test_questions = [ - "Q: Hello!", - "Q: What is your name?", - "Q: What color is the sky?", - "Q: How many legs does a dog have?", - "Q: What is 2 plus 3?", - "Q: What do you use a pen for?", - ] - - for question in test_questions: - console.print(f"\n[yellow]{question}[/yellow]") - - # Generate answer - response = model.generate(tokenizer, prompt=question + "\nA:", max_new_tokens=50, temperature=0.8) - - # Extract just the answer part - if "\nA:" in response: - answer = response.split("\nA:")[1].split("\n")[0].strip() - console.print(f"[green]A: {answer}[/green]") - else: - console.print(f"[dim]{response}[/dim]") - - console.print("\n" + "=" * 70) - - -def main(): - """Main training pipeline""" - parser = argparse.ArgumentParser(description='Train TinyGPT on TinyTalks Q&A') - parser.add_argument('--epochs', type=int, default=30, help='Number of training epochs (default: 30)') - parser.add_argument('--batch-size', type=int, default=16, help='Batch size (default: 16)') - parser.add_argument('--lr', type=float, default=0.001, help='Learning rate (default: 0.001)') - parser.add_argument('--seq-length', type=int, default=64, help='Sequence length (default: 64)') - parser.add_argument('--embed-dim', type=int, default=96, help='Embedding dimension (default: 96, ~500K params)') - parser.add_argument('--num-layers', type=int, default=4, help='Number of transformer layers (default: 4)') - parser.add_argument('--num-heads', type=int, default=4, help='Number of attention heads (default: 4)') - parser.add_argument('--levels', type=str, default=None, help='Difficulty levels to train on (e.g. "1" or "1,2"). 
Default: all levels') - args = parser.parse_args() - - # Parse levels argument - if args.levels: - levels = [int(l.strip()) for l in args.levels.split(',')] - else: - levels = None - - print_banner() - - # Import TinyTorch components - console.print("\n[bold]Importing TinyTorch components...[/bold]") - try: - from tinytorch.core.tensor import Tensor - from tinytorch.core.optimizers import Adam - from tinytorch.core.losses import CrossEntropyLoss - from tinytorch.text.tokenization import CharTokenizer - console.print("[green]βœ“[/green] All modules imported successfully!") - except ImportError as e: - console.print(f"[red]βœ—[/red] Import error: {e}") - console.print("\nMake sure you have completed all required modules:") - console.print(" - Module 01 (Tensor)") - console.print(" - Module 02 (Activations)") - console.print(" - Module 03 (Layers)") - console.print(" - Module 04 (Losses)") - console.print(" - Module 05 (Autograd)") - console.print(" - Module 06 (Optimizers)") - console.print(" - Module 10 (Tokenization)") - console.print(" - Module 11 (Embeddings)") - console.print(" - Module 12 (Attention)") - console.print(" - Module 13 (Transformers)") - return - - # Load TinyTalks dataset - console.print("\n[bold]Loading TinyTalks dataset...[/bold]") - dataset_path = os.path.join(project_root, "datasets", "tinytalks", "splits", "train.txt") - - if not os.path.exists(dataset_path): - console.print(f"[red]βœ—[/red] Dataset not found: {dataset_path}") - console.print("\nPlease generate the dataset first:") - console.print(" python datasets/tinytalks/scripts/generate_tinytalks.py") - return - - with open(dataset_path, 'r', encoding='utf-8') as f: - text = f.read() - - console.print(f"[green]βœ“[/green] Loaded dataset from: {os.path.basename(dataset_path)}") - console.print(f" File size: {len(text)} characters") - - # Create dataset with level filtering - dataset = TinyTalksDataset(text, seq_length=args.seq_length, levels=levels) - - # Set test prompts based on levels 
- if levels and 1 in levels: - test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: Hi!"] - elif levels and 2 in levels: - test_prompts = ["Q: What color is the sky?", "Q: How many legs does a dog have?"] - elif levels and 3 in levels: - test_prompts = ["Q: What is 2 plus 3?", "Q: What is 5 minus 2?"] - else: - test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: What color is the sky?"] - - # Initialize model - console.print("\n[bold]Initializing TinyGPT model...[/bold]") - model = TinyGPT( - vocab_size=dataset.tokenizer.vocab_size, - embed_dim=args.embed_dim, - num_layers=args.num_layers, - num_heads=args.num_heads, - max_seq_len=args.seq_length, - dropout=0.1 - ) - - # Initialize optimizer and loss - console.print("\n[bold]Initializing training components...[/bold]") - optimizer = Adam(model.parameters(), lr=args.lr) - criterion = CrossEntropyLoss() - console.print(f"[green]βœ“[/green] Optimizer: Adam (lr={args.lr})") - console.print(f"[green]βœ“[/green] Loss: CrossEntropyLoss") - - # Print configuration - table = Table(title="Training Configuration", box=box.ROUNDED) - table.add_column("Parameter", style="cyan") - table.add_column("Value", style="green") - - dataset_desc = f"TinyTalks Level(s) {levels}" if levels else "TinyTalks (All Levels)" - table.add_row("Dataset", dataset_desc) - table.add_row("Vocabulary Size", str(dataset.tokenizer.vocab_size)) - table.add_row("Model Parameters", f"{model.count_parameters():,}") - table.add_row("Epochs", str(args.epochs)) - table.add_row("Batch Size", str(args.batch_size)) - table.add_row("Learning Rate", str(args.lr)) - table.add_row("Sequence Length", str(args.seq_length)) - table.add_row("Embedding Dim", str(args.embed_dim)) - table.add_row("Layers", str(args.num_layers)) - table.add_row("Attention Heads", str(args.num_heads)) - table.add_row("Expected Time", "3-5 minutes") - - console.print(table) - - # Train model - train_tinytalks_gpt( - model=model, - dataset=dataset, - optimizer=optimizer, - 
criterion=criterion, - epochs=args.epochs, - batch_size=args.batch_size, - log_interval=50, - test_prompts=test_prompts - ) - - # Demo Q&A - demo_questions(model, dataset.tokenizer) - - # Success message - console.print("\n[bold green]πŸŽ‰ Congratulations![/bold green]") - console.print("You've successfully trained a transformer to answer questions!") - console.print("\nYou used:") - console.print(" βœ“ YOUR Tensor implementation (Module 01)") - console.print(" βœ“ YOUR Activations (Module 02)") - console.print(" βœ“ YOUR Linear layers (Module 03)") - console.print(" βœ“ YOUR CrossEntropyLoss (Module 04)") - console.print(" βœ“ YOUR Autograd system (Module 05)") - console.print(" βœ“ YOUR Adam optimizer (Module 06)") - console.print(" βœ“ YOUR CharTokenizer (Module 10)") - console.print(" βœ“ YOUR Embeddings (Module 11)") - console.print(" βœ“ YOUR Multi-Head Attention (Module 12)") - console.print(" βœ“ YOUR Transformer blocks (Module 13)") - console.print("\n[bold]This is the foundation of ChatGPT, built by YOU from scratch![/bold]") - - -if __name__ == "__main__": - main() - diff --git a/milestones/05_2017_transformer/tinytalks_interactive.py b/milestones/05_2017_transformer/tinytalks_interactive.py deleted file mode 100644 index df80453f..00000000 --- a/milestones/05_2017_transformer/tinytalks_interactive.py +++ /dev/null @@ -1,427 +0,0 @@ -""" -TinyTalks Interactive Learning Dashboard -========================================= - -Watch a chatbot learn in real-time! 
def create_tokenizer(conversations):
    """
    Create a character-level tokenizer with four special tokens.

    Special tokens occupy ids 0-3 (the ids decode_tokens relies on):
        <PAD>=0  padding      <SOS>=1  start of sequence
        <SEP>=2  separator    <EOS>=3  end of sequence
    Ordinary characters are numbered from 4 upward in sorted order.

    Args:
        conversations: Iterable of (question, answer) string pairs.

    Returns:
        (char_to_idx, idx_to_char) lookup dictionaries.
    """
    all_text = ' '.join([q + ' ' + a for q, a in conversations])
    all_chars = sorted(set(all_text))

    # Each special token needs its own distinct name: with four identical
    # keys the dict literal keeps only the last entry, and real characters
    # would be assigned the reserved ids 1-3.
    special_tokens = {
        '<PAD>': 0,
        '<SOS>': 1,
        '<SEP>': 2,
        '<EOS>': 3,
    }

    char_to_idx = {**special_tokens}
    idx_to_char = {v: k for k, v in special_tokens.items()}

    for idx, char in enumerate(all_chars, start=len(special_tokens)):
        char_to_idx[char] = idx
        idx_to_char[idx] = char

    return char_to_idx, idx_to_char


def encode_conversation(question, answer, char_to_idx, max_len=80):
    """
    Encode a Q&A pair as: <SOS> question <SEP> answer <EOS> <PAD>...

    Characters missing from the vocabulary map to <PAD>. The result is
    right-padded or truncated to exactly max_len tokens; truncation can
    cut off the trailing <EOS>.

    Returns:
        List of max_len token ids.
    """
    pad = char_to_idx['<PAD>']

    tokens = [char_to_idx['<SOS>']]
    tokens.extend(char_to_idx.get(c, pad) for c in question)
    tokens.append(char_to_idx['<SEP>'])
    tokens.extend(char_to_idx.get(c, pad) for c in answer)
    tokens.append(char_to_idx['<EOS>'])

    if len(tokens) < max_len:
        tokens = tokens + [pad] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len]

    return tokens


def decode_tokens(tokens, idx_to_char):
    """
    Decode token ids to a string.

    <PAD>, <SOS> and <SEP> are skipped, decoding stops at the first <EOS>,
    and ids missing from idx_to_char are rendered as '?'.
    """
    pad_id, sos_id, sep_id, eos_id = 0, 1, 2, 3  # ids fixed by create_tokenizer

    chars = []
    for t in tokens:
        if t == eos_id:
            break
        if t in (pad_id, sos_id, sep_id):
            continue
        chars.append(idx_to_char.get(t, '?'))
    return ''.join(chars)


def generate_response(model, question, char_to_idx, idx_to_char, max_len=50, seq_len=80):
    """
    Greedily generate a response to a question.

    The prompt is <SOS> question <SEP>; at each step the sequence is padded
    to seq_len, run through the model, and the arg-max token at the last
    real position is appended. Generation stops at <EOS>/<PAD>, after
    max_len tokens, or when the window is full.

    Args:
        model: Object whose forward(x) returns logits indexed as
            [batch, position, vocab].
        question: Question text.
        char_to_idx, idx_to_char: Lookups from create_tokenizer.
        max_len: Maximum number of generated tokens.
        seq_len: Fixed input window fed to the model (default 80 matches
            the training sequence length used by this script).

    Returns:
        Decoded response string.
    """
    pad = char_to_idx['<PAD>']
    eos = char_to_idx['<EOS>']

    tokens = [char_to_idx['<SOS>']]
    tokens.extend(char_to_idx.get(c, pad) for c in question)
    tokens.append(char_to_idx['<SEP>'])

    generated_tokens = []
    for _ in range(max_len):
        # Fixed-size window: truncate, then right-pad with <PAD>
        input_tokens = (tokens + generated_tokens)[:seq_len]
        input_tokens += [pad] * (seq_len - len(input_tokens))

        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        # The logits at the last real token predict the next token
        next_pos = len(tokens) + len(generated_tokens) - 1
        if next_pos >= logits.shape[1]:
            break

        next_token = int(np.argmax(logits.data[0, next_pos, :]))
        if next_token == eos or next_token == pad:
            break

        generated_tokens.append(next_token)

    return decode_tokens(generated_tokens, idx_to_char)
return results - - -def show_checkpoint_panel(checkpoint_num, step, loss, results, prev_results=None): - """Show checkpoint results in a nice panel.""" - if RICH_AVAILABLE: - console = Console() - - # Header - console.print() - console.print("=" * 70, style="bold cyan") - console.print(f"CHECKPOINT {checkpoint_num} - Step {step:,} | Loss: {loss:.4f}", - style="bold yellow", justify="center") - console.print("=" * 70, style="bold cyan") - console.print() - - # Show responses - table = Table(show_header=True, header_style="bold magenta") - table.add_column("Question", style="cyan", width=25) - table.add_column("Response", style="green", width=35) - if prev_results: - table.add_column("Previous", style="dim", width=10) - - for i, (question, response) in enumerate(results): - if prev_results and i < len(prev_results): - prev_response = prev_results[i][1] - improved = "πŸ“ˆ" if len(response) > len(prev_response) else "πŸ“‰" - table.add_row(question, response, improved) - else: - table.add_row(question, response) - - console.print(table) - console.print() - else: - # Fallback to simple print - print() - print("=" * 70) - print(f"CHECKPOINT {checkpoint_num} - Step {step:,} | Loss: {loss:.4f}") - print("=" * 70) - print() - for question, response in results: - print(f"Q: {question}") - print(f"A: {response}") - print() - - -def train_interactive(model, optimizer, loss_fn, train_data, test_questions, - char_to_idx, idx_to_char, max_time_minutes=15, - checkpoint_steps=1000, auto_continue_seconds=10): - """ - Train with interactive checkpoints. 
- - Args: - checkpoint_steps: Pause every N steps to show results - auto_continue_seconds: Auto-continue after N seconds (0 = wait for ENTER) - """ - max_time_seconds = max_time_minutes * 60 - - print("=" * 70) - print(f"INTERACTIVE TRAINING - {max_time_minutes} MINUTES") - print("=" * 70) - print(f"Dataset: {len(train_data)} conversations") - print(f"Checkpoints: Every {checkpoint_steps} steps") - print(f"Auto-continue: {auto_continue_seconds}s (or press ENTER)") - print("=" * 70) - print() - print("Watch the model learn from gibberish to coherent responses!") - print() - - # Initial evaluation (before training) - print("Evaluating initial model (untrained)...") - initial_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char) - show_checkpoint_panel(0, 0, 999.9, initial_results) - - if auto_continue_seconds > 0: - print(f"Starting training in {auto_continue_seconds} seconds (or press ENTER)...") - time.sleep(auto_continue_seconds) - elif auto_continue_seconds == 0: - print("Starting training immediately...") - time.sleep(0.5) - else: - input("Press ENTER to start training...") - - print() - print("Training started...") - print() - - start_time = time.time() - losses = [] - step = 0 - checkpoint_num = 1 - prev_results = initial_results - - next_checkpoint = checkpoint_steps - - while True: - elapsed = time.time() - start_time - if elapsed >= max_time_seconds: - break - - # Training step - tokens = train_data[np.random.randint(len(train_data))] - input_seq = tokens[:-1] - target_seq = tokens[1:] - - x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False) - y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False) - - logits = model.forward(x) - - batch_size, seq_len, vocab_size = logits.shape - logits_flat = logits.reshape(batch_size * seq_len, vocab_size) - targets_flat = y_true.reshape(batch_size * seq_len) - loss = loss_fn.forward(logits_flat, targets_flat) - - optimizer.zero_grad() - loss.backward() - 
- for param in model.parameters(): - if param.grad is not None: - np.clip(param.grad, -1.0, 1.0, out=param.grad) - - optimizer.step() - - losses.append(loss.data.item()) - step += 1 - - # Show progress every 100 steps - if step % 100 == 0: - avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - print(f"[{int(elapsed):4d}s] Step {step:5d} | Loss: {avg_loss:.4f}") - - # Checkpoint evaluation - if step >= next_checkpoint: - avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - - print() - print(f"Evaluating at step {step}...") - current_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char) - - show_checkpoint_panel(checkpoint_num, step, avg_loss, current_results, prev_results) - - prev_results = current_results - checkpoint_num += 1 - next_checkpoint += checkpoint_steps - - # Interactive pause - if auto_continue_seconds > 0: - print(f"Continuing in {auto_continue_seconds}s (or press ENTER)...") - time.sleep(auto_continue_seconds) - elif auto_continue_seconds == 0: - print("Continuing immediately...") - time.sleep(0.5) - else: - input("Press ENTER to continue training...") - - print() - print("Training resumed...") - print() - - # Final results - final_elapsed = time.time() - start_time - final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses) - initial_loss = np.mean(losses[:10]) - improvement = (1 - final_loss / initial_loss) * 100 - - print() - print("=" * 70) - print("TRAINING COMPLETE!") - print("=" * 70) - print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.1f} minutes)") - print(f"Total steps: {step:,}") - print(f"Initial loss: {initial_loss:.4f}") - print(f"Final loss: {final_loss:.4f}") - print(f"Improvement: {improvement:.1f}%") - print() - - # Final evaluation - print("Final evaluation...") - final_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char) - show_checkpoint_panel("FINAL", step, final_loss, final_results, 
prev_results) - - return losses, step - - -# ============================================================================ -# Main -# ============================================================================ - -def main(): - print() - print("=" * 70) - print("TINYTALKS INTERACTIVE LEARNING DASHBOARD") - print("=" * 70) - print() - print("Watch a transformer learn to chat in real-time!") - print("You'll see responses improve from gibberish to coherent answers.") - print() - - # Dataset - conversations = create_tinytalks_dataset() - stats = get_dataset_stats() - - print(f"Dataset: {stats['total_examples']} examples ({stats['unique_examples']} unique)") - print() - - # Tokenizer - char_to_idx, idx_to_char = create_tokenizer(conversations) - vocab_size = len(idx_to_char) - - # Encode - max_seq_len = 80 - train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations] - - # Test questions for checkpoints - test_questions = [ - "Hi", - "How are you", - "What is your name", - "What is the sky", - "Is grass green", - ] - - # Model: Ultra-tiny for speed - config = { - 'vocab_size': vocab_size, - 'embed_dim': 16, - 'num_layers': 1, - 'num_heads': 2, - 'max_seq_len': max_seq_len, - } - - model = GPT(**config) - num_params = sum(np.prod(p.shape) for p in model.parameters()) - print(f"Model: {num_params:,} parameters") - print() - - # Optimizer - optimizer = Adam(model.parameters(), lr=0.001) - loss_fn = CrossEntropyLoss() - - # Settings - train_time = 5 # minutes (shorter for demo) - checkpoint_steps = 1000 # Evaluate every 1000 steps (~1-2 minutes) - auto_continue = 0 # Auto-continue immediately (0 = no wait for demo) - - print(f"Training for {train_time} minutes") - print(f"Checkpoints every {checkpoint_steps} steps") - print() - - # Train with interactive checkpoints - losses, total_steps = train_interactive( - model=model, - optimizer=optimizer, - loss_fn=loss_fn, - train_data=train_data, - test_questions=test_questions, - 
char_to_idx=char_to_idx, - idx_to_char=idx_to_char, - max_time_minutes=train_time, - checkpoint_steps=checkpoint_steps, - auto_continue_seconds=auto_continue - ) - - print() - print("=" * 70) - print("DEMO COMPLETE!") - print("=" * 70) - print() - print("You just watched a transformer learn from scratch!") - print(f"βœ“ {total_steps:,} training steps") - print(f"βœ“ {len(losses)} loss values") - print(f"βœ“ {(1 - np.mean(losses[-100:])/np.mean(losses[:10]))*100:.1f}% improvement") - print() - print("Key takeaway: Loss decrease = Better responses!") - print() - - -if __name__ == "__main__": - main() - diff --git a/milestones/05_2017_transformer/train_monitored.py b/milestones/05_2017_transformer/train_monitored.py deleted file mode 100755 index 3dd95495..00000000 --- a/milestones/05_2017_transformer/train_monitored.py +++ /dev/null @@ -1,336 +0,0 @@ -#!/usr/bin/env python3 -""" -Monitored Training Script for TinyTalks -======================================== - -Features: -- Early stopping if loss doesn't improve -- Continuous progress monitoring -- Automatic experiment termination for bad runs -- Clear feedback on learning progress - -Usage: - python train_monitored.py --mode test # 10 epochs, quick validation - python train_monitored.py --mode full # 30 epochs, full training -""" - -import sys -import os -import argparse -import time -import numpy as np -from pathlib import Path - -# Add project root to path -project_root = Path(__file__).parent.parent.parent -sys.path.insert(0, str(project_root)) - -from rich.console import Console -from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn -from rich.table import Table -from rich import box - -# Import TinyTorch components -from tinytorch.core.tensor import Tensor -from tinytorch.core.autograd import enable_autograd -from tinytorch.core.losses import CrossEntropyLoss -from tinytorch.core.optimizers import Adam -from tinytorch.text.tokenization import CharTokenizer - -console = Console() - -# Import 
TinyGPT and dataset classes -exec(open(project_root / "milestones/05_2017_transformer/tinytalks_gpt.py").read()) - - -class TrainingMonitor: - """Monitor training progress and implement early stopping""" - - def __init__(self, patience=5, min_delta=0.01): - """ - Args: - patience: Number of checks without improvement before stopping - min_delta: Minimum change in loss to count as improvement - """ - self.patience = patience - self.min_delta = min_delta - self.best_loss = float('inf') - self.checks_without_improvement = 0 - self.losses = [] - - def check(self, current_loss): - """ - Check if training should continue - - Returns: - (should_continue, message) - """ - self.losses.append(current_loss) - - # Calculate improvement - improvement = self.best_loss - current_loss - - if improvement > self.min_delta: - # Significant improvement - self.best_loss = current_loss - self.checks_without_improvement = 0 - return True, f"βœ“ Loss improved by {improvement:.4f}" - else: - # No significant improvement - self.checks_without_improvement += 1 - - if self.checks_without_improvement >= self.patience: - return False, f"βœ— No improvement for {self.patience} checks. Stopping." 
- else: - return True, f"⚠ No improvement ({self.checks_without_improvement}/{self.patience})" - - def summary(self): - """Get training summary""" - if len(self.losses) < 2: - return "Not enough data" - - initial = self.losses[0] - final = self.losses[-1] - best = min(self.losses) - decrease = initial - final - decrease_pct = (decrease / initial) * 100 if initial > 0 else 0 - - return { - 'initial_loss': initial, - 'final_loss': final, - 'best_loss': best, - 'total_decrease': decrease, - 'decrease_percent': decrease_pct, - 'num_checks': len(self.losses) - } - - -def train_with_monitoring(model, dataset, optimizer, criterion, config, monitor): - """ - Train with continuous monitoring and early stopping - - Args: - model: TinyGPT model - dataset: TinyTalksDataset - optimizer: Adam optimizer - criterion: CrossEntropyLoss - config: Training configuration dict - monitor: TrainingMonitor instance - - Returns: - success: True if training completed successfully - """ - epochs = config['epochs'] - batch_size = config['batch_size'] - check_interval = config.get('check_interval', 50) # Check every N batches - - console.print(f"\n[bold cyan]Starting Training with Monitoring[/bold cyan]") - console.print(f" Check interval: Every {check_interval} batches") - console.print(f" Early stopping: {monitor.patience} checks without improvement\n") - - total_batches_processed = 0 - start_time = time.time() - - for epoch in range(epochs): - epoch_start = time.time() - epoch_loss = 0.0 - batch_count = 0 - - console.print(f"[bold]Epoch {epoch+1}/{epochs}[/bold]") - - # Create batches - num_sequences = len(dataset) - indices = np.random.permutation(num_sequences) - - for batch_start in range(0, num_sequences, batch_size): - batch_end = min(batch_start + batch_size, num_sequences) - batch_indices = indices[batch_start:batch_end] - - # Get batch data - batch_inputs = [] - batch_targets = [] - for idx in batch_indices: - input_seq, target_seq = dataset[idx] - batch_inputs.append(input_seq) - 
batch_targets.append(target_seq) - - # Convert to tensors - batch_input = Tensor(np.array(batch_inputs)) - batch_target = Tensor(np.array(batch_targets)) - - # Forward pass - logits = model.forward(batch_input) - - # Reshape for loss - batch_size_actual, seq_length, vocab_size = logits.shape - logits_2d = logits.reshape(batch_size_actual * seq_length, vocab_size) - targets_1d = batch_target.reshape(-1) - - # Compute loss - loss = criterion.forward(logits_2d, targets_1d) - - # Backward and optimize - loss.backward() - optimizer.step() - optimizer.zero_grad() - - # Track loss - loss_value = float(loss.data) - epoch_loss += loss_value - batch_count += 1 - total_batches_processed += 1 - - # Monitor progress at check intervals - if total_batches_processed % check_interval == 0: - avg_loss = epoch_loss / batch_count - should_continue, message = monitor.check(avg_loss) - - elapsed = time.time() - start_time - console.print(f" Batch {total_batches_processed} | Loss: {avg_loss:.4f} | {message} | Time: {elapsed:.1f}s") - - if not should_continue: - console.print(f"\n[yellow]Early stopping triggered at epoch {epoch+1}, batch {batch_count}[/yellow]") - return False - - # Epoch summary - avg_epoch_loss = epoch_loss / batch_count - epoch_time = time.time() - epoch_start - console.print(f" β†’ Epoch {epoch+1} complete: Avg Loss = {avg_epoch_loss:.4f} | Time: {epoch_time:.1f}s\n") - - console.print(f"[green]βœ“ Training completed successfully![/green]\n") - return True - - -def main(): - parser = argparse.ArgumentParser(description='Monitored TinyTalks Training') - parser.add_argument('--mode', choices=['test', 'full'], default='test', - help='Training mode: test (10 epochs) or full (30 epochs)') - parser.add_argument('--patience', type=int, default=5, - help='Early stopping patience (checks without improvement)') - parser.add_argument('--min-delta', type=float, default=0.01, - help='Minimum loss decrease to count as improvement') - parser.add_argument('--check-interval', 
type=int, default=50, - help='Check progress every N batches') - - args = parser.parse_args() - - # Enable autograd - enable_autograd() - - # Configuration based on mode - if args.mode == 'test': - config = { - 'epochs': 10, - 'batch_size': 32, - 'lr': 0.001, - 'embed_dim': 128, - 'num_layers': 6, - 'num_heads': 8, - 'check_interval': args.check_interval, - 'mode': 'TEST (Quick Validation)' - } - else: # full - config = { - 'epochs': 30, - 'batch_size': 32, - 'lr': 0.001, - 'embed_dim': 128, - 'num_layers': 6, - 'num_heads': 8, - 'check_interval': args.check_interval, - 'mode': 'FULL (Complete Training)' - } - - # Display configuration - console.print("\n[bold cyan]═══════════════════════════════════════════════════[/bold cyan]") - console.print("[bold cyan] Monitored TinyTalks Training - Option C [/bold cyan]") - console.print("[bold cyan]═══════════════════════════════════════════════════[/bold cyan]\n") - - table = Table(box=box.ROUNDED) - table.add_column("Parameter", style="cyan") - table.add_column("Value", style="yellow") - - table.add_row("Mode", config['mode']) - table.add_row("Epochs", str(config['epochs'])) - table.add_row("Batch Size", str(config['batch_size'])) - table.add_row("Learning Rate", str(config['lr'])) - table.add_row("Model Size", f"{config['embed_dim']}d, {config['num_layers']}L, {config['num_heads']}H") - table.add_row("Early Stopping Patience", str(args.patience)) - table.add_row("Min Delta", str(args.min_delta)) - table.add_row("Check Interval", f"Every {args.check_interval} batches") - - console.print(table) - console.print() - - # Load dataset - console.print("[bold]Loading TinyTalks dataset...[/bold]") - dataset_path = project_root / "datasets/tinytalks/splits/train.txt" - with open(dataset_path, 'r') as f: - text = f.read() - - dataset = TinyTalksDataset(text, seq_length=64) - console.print(f" βœ“ Loaded: {len(text):,} chars, {dataset.tokenizer.vocab_size} vocab\n") - - # Initialize model - console.print("[bold]Initializing 
model...[/bold]") - model = TinyGPT( - vocab_size=dataset.tokenizer.vocab_size, - embed_dim=config['embed_dim'], - num_layers=config['num_layers'], - num_heads=config['num_heads'], - max_seq_len=64 - ) - - params = model.parameters() - param_count = sum(p.data.size for p in params) - console.print(f" βœ“ Model initialized: {param_count:,} parameters\n") - - # Initialize training components - optimizer = Adam(params, lr=config['lr']) - criterion = CrossEntropyLoss() - monitor = TrainingMonitor(patience=args.patience, min_delta=args.min_delta) - - # Train - console.print("[bold]Starting training...[/bold]\n") - start_time = time.time() - - success = train_with_monitoring(model, dataset, optimizer, criterion, config, monitor) - - total_time = time.time() - start_time - - # Summary - console.print("\n[bold cyan]═══════════════════════════════════════════════════[/bold cyan]") - console.print("[bold cyan] Training Summary [/bold cyan]") - console.print("[bold cyan]═══════════════════════════════════════════════════[/bold cyan]\n") - - summary = monitor.summary() - - result_table = Table(box=box.ROUNDED) - result_table.add_column("Metric", style="cyan") - result_table.add_column("Value", style="yellow") - - result_table.add_row("Status", "βœ“ SUCCESS" if success else "⚠ EARLY STOP") - result_table.add_row("Total Time", f"{total_time/60:.1f} minutes") - result_table.add_row("Initial Loss", f"{summary['initial_loss']:.4f}") - result_table.add_row("Final Loss", f"{summary['final_loss']:.4f}") - result_table.add_row("Best Loss", f"{summary['best_loss']:.4f}") - result_table.add_row("Total Decrease", f"{summary['total_decrease']:.4f} ({summary['decrease_percent']:.1f}%)") - result_table.add_row("Checks Performed", str(summary['num_checks'])) - - console.print(result_table) - console.print() - - # Recommendation - if success and summary['decrease_percent'] > 50: - console.print("[bold green]βœ“ EXCELLENT: Model is learning well! 
Continue with full training.[/bold green]") - elif success and summary['decrease_percent'] > 20: - console.print("[bold yellow]⚠ MODERATE: Model is learning but slowly. Consider tuning hyperparameters.[/bold yellow]") - elif success: - console.print("[bold red]βœ— POOR: Model not learning effectively. Needs hyperparameter adjustment.[/bold red]") - else: - console.print("[bold red]βœ— FAILED: Training stopped early. Try different hyperparameters.[/bold red]") - - -if __name__ == "__main__": - main() -