Remove non-Vaswani transformer examples

Keep only the three Vaswani examples that reference the 2017 Attention Is All You Need paper:
- vaswani_chatgpt.py (Q&A generation)
- vaswani_copilot.py (Python autocomplete)
- vaswani_shakespeare.py (text generation)

Removed 14 redundant example files
This commit is contained in:
Vijay Janapa Reddi
2025-11-05 09:15:17 -05:00
parent a49d4c3810
commit aa36fef9df
14 changed files with 0 additions and 5251 deletions

View File

@@ -1,75 +0,0 @@
#!/usr/bin/env python3
"""
Download and prepare TinyStories dataset for TinyTorch training.
TinyStories is a dataset of simple, synthetic stories designed for
training small language models. It's much easier than Shakespeare!
"""
import os
import urllib.request
def download_tinystories():
    """Download the TinyStories validation split and print a short summary.

    The file is stored as ``../datasets/tinystories/tinystories_val.txt``
    relative to this script.  If the file is already present it is reused
    instead of being downloaded again.

    Returns:
        The path of the dataset file, or ``None`` if the download failed.
    """
    # Create data directory
    data_dir = os.path.join(os.path.dirname(__file__), '../datasets/tinystories')
    os.makedirs(data_dir, exist_ok=True)

    # TinyStories validation set (smaller, good for testing)
    urls = {
        'tiny_val': 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-valid.txt',
        'tiny_train_small': 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories-train.txt'
    }

    print("📥 Downloading TinyStories dataset...")
    print("="*70)

    # Start with validation set (much smaller for testing)
    filename = 'tinystories_val.txt'
    filepath = os.path.join(data_dir, filename)

    if os.path.exists(filepath):
        print(f"{filename} already exists")
        size = os.path.getsize(filepath) / (1024 * 1024)
        print(f" Size: {size:.2f} MB")
    else:
        print(f"⬇️ Downloading {filename}...")
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer is never mistaken for a complete dataset by
        # the "already exists" check on the next run.
        partial_path = filepath + '.part'
        try:
            urllib.request.urlretrieve(urls['tiny_val'], partial_path)
            os.replace(partial_path, filepath)
            size = os.path.getsize(filepath) / (1024 * 1024)
            print(f"✅ Downloaded! Size: {size:.2f} MB")
        except Exception as e:
            # Best-effort cleanup of the incomplete file; the original
            # download error is what gets reported to the user.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            print(f"❌ Error downloading: {e}")
            print("\n💡 Alternative: Download manually from:")
            print(f" {urls['tiny_val']}")
            print(f" Save to: {filepath}")
            return None

    # Read and show sample
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    print(f"\n📊 Dataset Stats:")
    print(f" Total characters: {len(text):,}")
    print(f" Total words: {len(text.split()):,}")
    print(f" Unique characters: {len(set(text))}")

    # Show first story (str.split always yields at least one element)
    first_story = text.split('<|endoftext|>')[0].strip()
    print(f"\n📖 Sample Story:")
    print(" " + "-"*66)
    print(" " + first_story[:300].replace('\n', '\n '))
    if len(first_story) > 300:
        print(" ...")
    print(" " + "-"*66)

    print(f"\n✅ TinyStories ready for training!")
    print(f" Location: {filepath}")
    return filepath
if __name__ == '__main__':
download_tinystories()

View File

@@ -1,338 +0,0 @@
"""
Milestone 05 - Level 1: Transformer Memorization Test
======================================================
SIMPLEST POSSIBLE TRANSFORMER TEST:
Can the transformer memorize and reproduce simple sequences?
Task: Given "ABCD", predict "BCDE"
Given "1234", predict "2345"
Expected:
- Train in < 2 minutes
- Loss should drop from ~3.0 to < 0.1
- Should perfectly predict next character
This validates:
✓ Transformer architecture works
✓ Attention mechanism works
✓ Gradient flow works
✓ Training loop works
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import numpy as np
import time
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.optimizers import Adam
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.models.transformer import GPT
enable_autograd()
# ============================================================================
# Level 1: Simple Memorization Dataset
# ============================================================================
def create_memorization_dataset():
    """
    Return the fixed list of five-character strings to memorize.

    The list holds, in order: consecutive alphabet runs, digit runs,
    single-character repetitions, and mixed letter/digit strings.
    """
    alphabet_runs = ["ABCDE", "FGHIJ", "KLMNO", "PQRST", "UVWXY"]
    digit_runs = ["12345", "67890"]
    repeated_chars = [letter * 5 for letter in "ABC"]
    mixed = ["A1B2C", "X9Y8Z"]
    return alphabet_runs + digit_runs + repeated_chars + mixed
def create_simple_tokenizer(sequences):
    """Build character <-> index lookup tables for ``sequences``.

    Characters are numbered from 1 in sorted order; index 0 is reserved
    for the ``'<PAD>'`` entry.

    Returns:
        Tuple ``(char_to_idx, idx_to_char)`` of dictionaries.
    """
    vocabulary = sorted(set(''.join(sequences)))

    char_to_idx = {}
    idx_to_char = {}
    for position, symbol in enumerate(vocabulary, start=1):
        char_to_idx[symbol] = position
        idx_to_char[position] = symbol

    # Padding entry goes in last, after all real characters
    char_to_idx['<PAD>'] = 0
    idx_to_char[0] = '<PAD>'
    return char_to_idx, idx_to_char
def encode_sequence(seq, char_to_idx, max_len=8):
    """Map ``seq`` to a list of token IDs of length ``max_len``.

    Characters missing from ``char_to_idx`` become 0.  Longer inputs are
    truncated; shorter ones are right-padded with 0.
    """
    token_ids = [char_to_idx.get(ch, 0) for ch in seq][:max_len]
    padding = [0] * (max_len - len(token_ids))
    return token_ids + padding
def decode_sequence(tokens, idx_to_char):
    """Turn token IDs back into a string, skipping every 0 token.

    IDs that are not in ``idx_to_char`` contribute nothing to the result.
    """
    pieces = []
    for token in tokens:
        if token == 0:
            continue
        pieces.append(idx_to_char.get(token, ''))
    return ''.join(pieces)
# ============================================================================
# Training
# ============================================================================
def train_memorization(model, optimizer, loss_fn, train_data, vocab_size, max_steps=200,
                       target_loss=0.1):
    """
    Train transformer to memorize sequences.
    Target: < 2 minutes, loss < ``target_loss`` (0.1 by default).

    Each step samples one sequence from ``train_data`` and trains on
    next-token prediction (input = all but the last token, target = all but
    the first token).

    Args:
        model: Object providing ``forward(x)`` and ``parameters()``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        loss_fn: Object providing ``forward(logits, targets)``.
        train_data: List of token-ID lists.
        vocab_size: Vocabulary size; only used in the printed header.
        max_steps: Maximum number of training steps.
        target_loss: Training stops early once the running average loss,
            checked every 50 steps, falls below this value.

    Returns:
        List with the loss value of every executed step.
    """
    print("=" * 70)
    print("TRAINING LEVEL 1: MEMORIZATION")
    print("=" * 70)
    print(f"Dataset: {len(train_data)} sequences")
    print(f"Vocab size: {vocab_size}")
    print(f"Max steps: {max_steps}")
    print(f"Target: Loss < {target_loss} in < 2 minutes")
    print()

    start_time = time.time()
    losses = []

    for step in range(max_steps):
        # Sample random sequence
        tokens = train_data[np.random.randint(len(train_data))]

        # Input: all but last token
        # Target: all but first token (next token prediction)
        input_seq = tokens[:-1]
        target_seq = tokens[1:]

        # Convert to tensors
        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)

        # Forward pass
        logits = model.forward(x)

        # Compute loss over all positions at once
        batch_size, seq_len, vocab_size_out = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size_out)
        targets_flat = y_true.reshape(batch_size * seq_len)
        loss = loss_fn.forward(logits_flat, targets_flat)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients element-wise, in place
        for param in model.parameters():
            if param.grad is not None:
                np.clip(param.grad, -1.0, 1.0, out=param.grad)

        # Update
        optimizer.step()
        losses.append(loss.data.item())

        # Progress every 50 steps
        if step % 50 == 0:
            # Slicing already returns the whole list when it is shorter than 100
            avg_loss = np.mean(losses[-100:])
            elapsed = time.time() - start_time
            print(f"Step {step:4d}/{max_steps} | Loss: {avg_loss:.4f} | Time: {elapsed:.1f}s")

            # Early stopping uses the same threshold that was announced above
            if avg_loss < target_loss:
                print(f"\n✓ Target reached! Loss < {target_loss} at step {step}")
                break

    if not losses:
        # No step was executed (max_steps <= 0); there is nothing to summarize
        return losses

    elapsed = time.time() - start_time
    final_loss = np.mean(losses[-100:])
    initial_loss = np.mean(losses[:10])
    improvement = (1 - final_loss / initial_loss) * 100

    print()
    print("=" * 70)
    print("TRAINING COMPLETE")
    print("=" * 70)
    print(f"Time: {elapsed:.1f} seconds")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()
    return losses
# ============================================================================
# Testing
# ============================================================================
def test_memorization(model, test_sequences, char_to_idx, idx_to_char):
    """
    Test if model can reproduce memorized sequences.

    Every sequence is encoded to length 8, fed to the model without its last
    token, and the greedy (argmax) prediction at each position is compared
    with the shifted target.  Comparison stops at the first padding token.

    Args:
        model: Object providing ``forward(x)``.
        test_sequences: Strings to check.
        char_to_idx: Character -> token ID mapping.
        idx_to_char: Token ID -> character mapping.

    Returns:
        Percentage (0-100) of sequences that were reproduced correctly;
        0.0 when ``test_sequences`` is empty.
    """
    print("=" * 70)
    print("TESTING LEVEL 1: MEMORIZATION")
    print("=" * 70)
    print()

    correct = 0
    total = len(test_sequences)

    for seq in test_sequences:
        # Encode
        tokens = encode_sequence(seq, char_to_idx, max_len=8)

        # Get model predictions
        x = Tensor(np.array([tokens[:-1]], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        # Decode predictions (greedy)
        predicted = [
            int(np.argmax(logits.data[0, i, :]))
            for i in range(logits.shape[1])
        ]

        # Compare against the shifted target sequence
        expected = tokens[1:]

        # Check if match (ignoring padding)
        match = True
        for exp, pred in zip(expected, predicted):
            if exp == 0:  # Padding, stop checking
                break
            if exp != pred:
                match = False
                break

        # Distinct markers so passing and failing rows can be told apart
        if match:
            correct += 1
            status = "✓"
        else:
            status = "✗"

        # Decode for display
        expected_str = decode_sequence(expected, idx_to_char)
        predicted_str = decode_sequence(predicted, idx_to_char)
        print(f"{status} Input: {seq[:4]:8s} → Expected: {expected_str:8s} | Got: {predicted_str:8s}")

    # Guard against an empty test set
    accuracy = (correct / total) * 100 if total else 0.0
    print()
    print(f"Accuracy: {correct}/{total} ({accuracy:.1f}%)")
    print()

    if accuracy >= 90:
        print("✓ LEVEL 1 PASSED: Transformer can memorize sequences!")
    else:
        print("✗ LEVEL 1 FAILED: Needs more training or debugging")
    return accuracy
# ============================================================================
# Main
# ============================================================================
def main():
    """Run the Level 1 memorization milestone end to end.

    Builds the dataset and tokenizer, creates a very small GPT model, trains
    it with ``train_memorization``, evaluates it with ``test_memorization``
    and prints a summary.
    """
    print()
    print("=" * 70)
    print("MILESTONE 05 - LEVEL 1: TRANSFORMER MEMORIZATION TEST")
    print("=" * 70)
    print()
    print("Goal: Train transformer to memorize simple sequences in < 2 minutes")
    print()

    # Create dataset
    sequences = create_memorization_dataset()
    char_to_idx, idx_to_char = create_simple_tokenizer(sequences)
    vocab_size = len(idx_to_char)
    print(f"Dataset: {len(sequences)} sequences")
    print(f"Vocabulary: {vocab_size} tokens")
    # Separate the raw string from its encoding so the line is readable
    print(f"Example: {sequences[0]} → {encode_sequence(sequences[0], char_to_idx)}")
    print()

    # Encode all sequences
    train_data = [encode_sequence(seq, char_to_idx, max_len=8) for seq in sequences]

    # Create ULTRA-tiny model for speed
    config = {
        'vocab_size': vocab_size,
        'embed_dim': 16,   # Super tiny!
        'num_layers': 1,   # Just 1 layer
        'num_heads': 2,    # 2 heads
        'max_seq_len': 8,  # Short sequences
    }
    print("Model configuration:")
    for key, val in config.items():
        print(f" {key}: {val}")
    print()

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    print()

    # Optimizer and loss
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Train
    print("Starting training...")
    print()
    losses = train_memorization(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        vocab_size=vocab_size,
        max_steps=200  # Reduced for speed (ultra-tiny model)
    )

    # Test
    print("Starting testing...")
    print()
    accuracy = test_memorization(model, sequences, char_to_idx, idx_to_char)

    # Summary
    print("=" * 70)
    print("LEVEL 1 SUMMARY")
    print("=" * 70)
    print(f"✓ Training: {len(losses)} steps")
    # Initial and final loss are separated by an arrow instead of running together
    print(f"✓ Loss: {np.mean(losses[:10]):.4f} → {np.mean(losses[-100:]):.4f}")
    print(f"✓ Accuracy: {accuracy:.1f}%")
    print()
    if accuracy >= 90:
        print("🎉 LEVEL 1 COMPLETE! Ready for Level 2: Pattern Completion")
    else:
        print("⚠️ LEVEL 1 INCOMPLETE: Needs debugging")
    print()
if __name__ == "__main__":
main()

View File

@@ -1,357 +0,0 @@
"""
Milestone 05 - Level 2: Transformer Pattern Completion
=======================================================
SIMPLE PATTERN COMPLETION TEST:
Can the transformer learn to complete simple patterns?
Task: Given "A B C", predict "D"
Given "1 2 3", predict "4"
Given "do re mi", predict "fa"
Expected:
- Train in < 5 minutes
- Loss should drop from ~3.0 to < 0.5
- Should complete 70%+ of patterns correctly
This validates:
✓ Transformer can learn relationships
✓ Attention mechanism captures patterns
✓ Model generalizes beyond memorization
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import numpy as np
import time
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.optimizers import Adam
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.models.transformer import GPT
enable_autograd()
# ============================================================================
# Level 2: Pattern Completion Dataset
# ============================================================================
def create_pattern_dataset():
    """
    Return the fixed list of ``(prompt, completion)`` string pairs.

    The pairs are grouped, in order, as alphabet runs, number runs,
    short word prompts, and repetitions of a single symbol.
    """
    alphabet = [("A B C", "D"), ("D E F", "G"), ("M N O", "P"), ("W X Y", "Z")]
    numbers = [("1 2 3", "4"), ("5 6 7", "8")]
    words = [("cat dog", "rat"), ("up down", "left")]
    repetition = [("A A A", "A"), ("B B B", "B"), ("1 1 1", "1")]
    return alphabet + numbers + words + repetition
def create_tokenizer(patterns):
    """Build character <-> index tables from ``(prompt, completion)`` pairs.

    Index 0 is ``'<PAD>'`` and index 1 is ``'<EOS>'``; real characters are
    numbered from 2 in sorted order.

    Returns:
        Tuple ``(char_to_idx, idx_to_char)`` of dictionaries.
    """
    combined_text = ' '.join([entry[0] + ' ' + entry[1] for entry in patterns])
    vocabulary = sorted(set(combined_text))

    char_to_idx = {}
    idx_to_char = {}
    for index, symbol in enumerate(vocabulary, start=2):
        char_to_idx[symbol] = index
        idx_to_char[index] = symbol

    # Special tokens are registered after the regular characters
    char_to_idx['<PAD>'] = 0
    char_to_idx['<EOS>'] = 1
    idx_to_char[0] = '<PAD>'
    idx_to_char[1] = '<EOS>'
    return char_to_idx, idx_to_char
def encode_pattern(input_str, target_str, char_to_idx, max_len=16):
    """Encode a pattern as ``input + <EOS> + target + <EOS>`` with padding.

    Characters missing from ``char_to_idx`` become 0.  The result is cut to
    ``max_len`` tokens, or right-padded with 0 up to that length.
    """
    eos = [1]

    def to_ids(text):
        # Character-level lookup with 0 as the fallback ID
        return [char_to_idx.get(ch, 0) for ch in text]

    combined = (to_ids(input_str) + eos + to_ids(target_str) + eos)[:max_len]
    return combined + [0] * (max_len - len(combined))
def decode_tokens(tokens, idx_to_char):
    """Decode token IDs up to the first padding (0) or EOS (1) token.

    IDs that are not in ``idx_to_char`` are rendered as ``'?'``.
    """
    decoded = []
    for token in tokens:
        if token in (0, 1):  # padding or EOS ends the text
            break
        decoded.append(idx_to_char.get(token, '?'))
    return ''.join(decoded)
# ============================================================================
# Training
# ============================================================================
def train_patterns(model, optimizer, loss_fn, train_data, vocab_size, max_steps=400):
    """
    Train transformer to complete patterns.
    Target: < 5 minutes, loss < 0.5

    One randomly chosen pattern is used per step for next-token prediction.
    Progress is printed every 50 steps and on the last step; training stops
    early when the running average loss printed there drops below 0.5.

    Returns:
        List with the loss value of every executed step.
    """
    banner = "=" * 70
    print(banner)
    print("TRAINING LEVEL 2: PATTERN COMPLETION")
    print(banner)
    print(f"Dataset: {len(train_data)} patterns")
    print(f"Vocab size: {vocab_size}")
    print(f"Max steps: {max_steps}")
    print("Target: Loss < 0.5 in < 5 minutes")
    print()

    started_at = time.time()
    losses = []
    last_step = max_steps - 1

    for step_idx in range(max_steps):
        # Pick one pattern at random
        sample = train_data[np.random.randint(len(train_data))]

        # Inputs drop the last token, targets drop the first (shift by one)
        x = Tensor(np.array([sample[:-1]], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([sample[1:]], dtype=np.int32), requires_grad=False)

        # Forward pass
        logits = model.forward(x)

        # Flatten batch and sequence axes for the loss
        n_batch, n_positions, n_classes = logits.shape
        flat_count = n_batch * n_positions
        loss = loss_fn.forward(
            logits.reshape(flat_count, n_classes),
            y_true.reshape(flat_count),
        )

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Element-wise, in-place gradient clipping
        for weight in model.parameters():
            if weight.grad is None:
                continue
            np.clip(weight.grad, -1.0, 1.0, out=weight.grad)

        # Update
        optimizer.step()
        losses.append(loss.data.item())

        # Report every 50 steps and on the final step
        if step_idx % 50 != 0 and step_idx != last_step:
            continue

        # The slice covers the whole history while it is shorter than 100
        running_loss = np.mean(losses[-100:])
        seconds = time.time() - started_at
        print(f"Step {step_idx:4d}/{max_steps} | Loss: {running_loss:.4f} | Time: {seconds:.1f}s")

        # Early stopping
        if running_loss < 0.5:
            print(f"\n✓ Target reached! Loss < 0.5 at step {step_idx}")
            break

    total_seconds = time.time() - started_at
    final_loss = np.mean(losses[-100:])
    initial_loss = np.mean(losses[:10])
    improvement = (1 - final_loss / initial_loss) * 100

    print()
    print(banner)
    print("TRAINING COMPLETE")
    print(banner)
    print(f"Time: {total_seconds:.1f} seconds")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()
    return losses
# ============================================================================
# Testing
# ============================================================================
def test_patterns(model, test_patterns, char_to_idx, idx_to_char, max_len=16):
    """
    Test if model can complete patterns.

    For each ``(input, expected)`` pair the input plus EOS is padded to
    ``max_len - 1`` tokens and fed to the model.  The argmax of the logits at
    the EOS position is decoded and compared with the FIRST character of the
    expected completion only.

    Args:
        model: Object providing ``forward(x)``.
        test_patterns: Iterable of ``(input_str, expected_target)`` pairs.
        char_to_idx: Character -> token ID mapping.
        idx_to_char: Token ID -> character mapping.
        max_len: Sequence length the patterns were encoded with.

    Returns:
        Percentage (0-100) of patterns whose first completion character was
        predicted correctly; 0.0 when ``test_patterns`` is empty.
    """
    print("=" * 70)
    print("TESTING LEVEL 2: PATTERN COMPLETION")
    print("=" * 70)
    print()

    correct = 0
    total = len(test_patterns)

    for input_str, expected_target in test_patterns:
        # Encode input + EOS
        input_tokens = [char_to_idx.get(c, 0) for c in input_str]
        input_tokens.append(1)  # EOS

        # Pad to max_len-1 (leave room for generation), then cut to that length
        input_tokens += [0] * (max_len - 1 - len(input_tokens))
        input_tokens = input_tokens[:max_len - 1]

        # Forward pass
        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        # Get prediction for next token (after input + EOS)
        input_len = len(input_str) + 1  # +1 for EOS
        if input_len < len(input_tokens):
            next_token_logits = logits.data[0, input_len - 1, :]  # Predict position after EOS
            predicted_token = int(np.argmax(next_token_logits))

            # Decode
            predicted_char = idx_to_char.get(predicted_token, '?')

            # Check if correct (compare first character of target)
            expected_first_char = expected_target[0] if len(expected_target) > 0 else ''
            match = (predicted_char == expected_first_char)
        else:
            # The prompt filled the whole window, so there is no position to read
            match = False
            predicted_char = '?'

        # Distinct markers so passing and failing rows can be told apart
        if match:
            correct += 1
            status = "✓"
        else:
            status = "✗"
        print(f"{status} Input: \"{input_str:12s}\" → Expected: \"{expected_target:6s}\" | Got: \"{predicted_char}\"")

    # Guard against an empty test set
    accuracy = (correct / total) * 100 if total else 0.0
    print()
    print(f"Accuracy: {correct}/{total} ({accuracy:.1f}%)")
    print()

    if accuracy >= 70:
        print("✓ LEVEL 2 PASSED: Transformer can complete patterns!")
    else:
        print("✗ LEVEL 2 FAILED: Needs more training")
    return accuracy
# ============================================================================
# Main
# ============================================================================
def main():
    """Run the Level 2 pattern-completion milestone end to end.

    Builds the pattern dataset and tokenizer, creates a small GPT model,
    trains it with ``train_patterns``, evaluates it with ``test_patterns``
    and prints a summary.
    """
    print()
    print("=" * 70)
    print("MILESTONE 05 - LEVEL 2: TRANSFORMER PATTERN COMPLETION")
    print("=" * 70)
    print()
    print("Goal: Train transformer to complete patterns in < 5 minutes")
    print()

    # Create dataset
    patterns = create_pattern_dataset()
    char_to_idx, idx_to_char = create_tokenizer(patterns)
    vocab_size = len(idx_to_char)
    print(f"Dataset: {len(patterns)} patterns")
    print(f"Vocabulary: {vocab_size} tokens")
    # Separate prompt and completion so the two quoted strings do not run together
    print(f"Example: \"{patterns[0][0]}\" → \"{patterns[0][1]}\"")
    print()

    # Encode all patterns
    max_len = 16
    train_data = [encode_pattern(inp, out, char_to_idx, max_len) for inp, out in patterns]

    # Create small model (bigger than Level 1)
    config = {
        'vocab_size': vocab_size,
        'embed_dim': 24,   # Slightly bigger
        'num_layers': 2,   # 2 layers
        'num_heads': 2,    # 2 heads
        'max_seq_len': max_len,
    }
    print("Model configuration:")
    for key, val in config.items():
        print(f" {key}: {val}")
    print()

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    print()

    # Optimizer and loss
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Train
    print("Starting training...")
    print()
    losses = train_patterns(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        vocab_size=vocab_size,
        max_steps=400
    )

    # Test
    print("Starting testing...")
    print()
    accuracy = test_patterns(model, patterns, char_to_idx, idx_to_char, max_len)

    # Summary
    print("=" * 70)
    print("LEVEL 2 SUMMARY")
    print("=" * 70)
    print(f"✓ Training: {len(losses)} steps")
    # Initial and final loss are separated by an arrow instead of running together
    print(f"✓ Loss: {np.mean(losses[:10]):.4f} → {np.mean(losses[-100:]):.4f}")
    print(f"✓ Accuracy: {accuracy:.1f}%")
    print()
    if accuracy >= 70:
        print("🎉 LEVEL 2 COMPLETE! Ready for Level 3: Text Generation")
    else:
        print("⚠️ LEVEL 2 INCOMPLETE: Needs more training")
    print()
if __name__ == "__main__":
main()

View File

@@ -1,109 +0,0 @@
"""
Simple GPT model for CodeBot milestone - bypasses LayerNorm gradient bug.
This is a workaround for the milestone until core Tensor operations
(subtraction, mean) are fixed to maintain gradient flow.
"""
import numpy as np
from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Linear
from tinytorch.core.attention import MultiHeadAttention
from tinytorch.core.activations import GELU
from tinytorch.text.embeddings import Embedding
class SimpleGPT:
    """
    GPT-style model built without LayerNorm.

    Layout:
    - token embedding + position embedding (summed)
    - ``num_layers`` blocks, each: attention with residual, then a
      Linear -> GELU -> Linear MLP with residual
    - linear projection to vocabulary logits

    According to the module notes this is a temporary workaround until the
    core Tensor subtraction/mean operations keep gradients flowing.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        num_layers: int,
        num_heads: int,
        max_seq_len: int,
        mlp_ratio: int = 4
    ):
        """Store the hyper-parameters and create all layers.

        Args:
            vocab_size: Number of token IDs (embedding rows, output logits).
            embed_dim: Width of the embeddings and of every block.
            num_layers: Number of transformer blocks.
            num_heads: Head count passed to ``MultiHeadAttention``.
            max_seq_len: Number of rows in the position embedding.
            mlp_ratio: MLP hidden width as a multiple of ``embed_dim``.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # Embeddings
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.position_embedding = Embedding(max_seq_len, embed_dim)

        # Transformer blocks (no LayerNorm); each block is a plain dict of layers
        hidden_dim = embed_dim * mlp_ratio
        self.blocks = [
            {
                'attention': MultiHeadAttention(embed_dim, num_heads),
                'mlp_fc1': Linear(embed_dim, hidden_dim),
                'mlp_gelu': GELU(),
                'mlp_fc2': Linear(hidden_dim, embed_dim),
            }
            for _ in range(num_layers)
        ]

        # Output projection
        self.lm_head = Linear(embed_dim, vocab_size)

    def forward(self, tokens: Tensor) -> Tensor:
        """
        Compute vocabulary logits for a batch of token indices.

        Args:
            tokens: Token indices, shape (batch_size, seq_len)

        Returns:
            logits: Predictions, shape (batch_size, seq_len, vocab_size)
        """
        _, seq_len = tokens.shape

        # Position indices 0..seq_len-1 as a single row
        position_ids = Tensor(np.arange(seq_len).reshape(1, seq_len))
        hidden = (
            self.token_embedding.forward(tokens)
            + self.position_embedding.forward(position_ids)
        )

        for layer in self.blocks:
            # Attention sub-layer with residual connection.
            # NOTE(review): no mask argument is passed here — confirm whether
            # MultiHeadAttention applies causal masking on its own.
            hidden = hidden + layer['attention'].forward(hidden)

            # MLP sub-layer with residual connection
            expanded = layer['mlp_fc1'].forward(hidden)
            activated = layer['mlp_gelu'].forward(expanded)
            hidden = hidden + layer['mlp_fc2'].forward(activated)

        # Project to vocabulary
        return self.lm_head.forward(hidden)

    def parameters(self):
        """Return the parameters of the embeddings, every block and the head."""
        collected = []
        collected.extend(self.token_embedding.parameters())
        collected.extend(self.position_embedding.parameters())
        for layer in self.blocks:
            # The GELU entry is not queried for parameters
            for key in ('attention', 'mlp_fc1', 'mlp_fc2'):
                collected.extend(layer[key].parameters())
        collected.extend(self.lm_head.parameters())
        return collected

View File

@@ -1,316 +0,0 @@
"""
Milestone 05 - 5-Minute Training Test
======================================
GOAL: Train the best possible transformer in exactly 5 minutes.
We'll optimize for:
- Maximum learning in 5 minutes
- Clear progress visualization
- Actual generation testing
- Student-friendly output
This will show what's realistically achievable in a classroom demo.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import numpy as np
import time
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.optimizers import Adam
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.models.transformer import GPT
enable_autograd()
# ============================================================================
# Dataset: Mix of memorization + patterns
# ============================================================================
def create_dataset():
    """Return the fixed training strings, with the whole list repeated 3 times."""
    # Easy memorization
    repeated_chars = ["AAAA", "BBBB", "CCCC", "1111", "2222"]
    # Simple sequences
    runs = [
        "ABCD", "EFGH", "IJKL", "MNOP", "QRST",
        "1234", "5678", "9012",
    ]
    # Two-character patterns
    pairs = [
        "AB", "CD", "EF", "GH",
        "12", "34", "56", "78",
    ]
    unique_sequences = repeated_chars + runs + pairs
    return unique_sequences * 3
def create_tokenizer(sequences):
    """Build character <-> index tables; index 0 is the ``'<PAD>'`` entry.

    Characters are numbered from 1 in sorted order.

    Returns:
        Tuple ``(char_to_idx, idx_to_char)`` of dictionaries.
    """
    alphabet = sorted(set(''.join(sequences)))
    indices = range(1, len(alphabet) + 1)

    char_to_idx = dict(zip(alphabet, indices))
    idx_to_char = dict(zip(indices, alphabet))

    char_to_idx['<PAD>'] = 0
    idx_to_char[0] = '<PAD>'
    return char_to_idx, idx_to_char
def encode(seq, char_to_idx, max_len=10):
    """Convert ``seq`` to exactly ``max_len`` token IDs.

    Unknown characters become 0; short inputs are right-padded with 0 and
    long inputs are truncated.
    """
    ids = []
    for ch in seq:
        ids.append(char_to_idx.get(ch, 0))

    shortfall = max_len - len(ids)
    if shortfall > 0:
        return ids + [0] * shortfall
    return ids[:max_len]
def decode(tokens, idx_to_char):
    """Join the characters for all non-zero tokens; unknown IDs add nothing."""
    visible = (tok for tok in tokens if tok != 0)
    return ''.join(map(lambda tok: idx_to_char.get(tok, ''), visible))
# ============================================================================
# Training with 5-minute time limit
# ============================================================================
def train_5_minutes(model, optimizer, loss_fn, train_data, max_time_seconds=300):
    """
    Train until the time budget is used up, showing progress throughout.

    One randomly chosen sequence is used per step for next-token prediction.
    Progress is printed every 50 steps until the first one-minute checkpoint
    has been reported, and after that once per elapsed minute.

    Args:
        model: Object providing ``forward(x)`` and ``parameters()``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        loss_fn: Object providing ``forward(logits, targets)``.
        train_data: List of token-ID lists.
        max_time_seconds: Wall-clock budget; no new step starts after it.

    Returns:
        Tuple ``(losses, step)``: per-step loss values and the step count.
    """
    print("=" * 70)
    print("TRAINING FOR 5 MINUTES")
    print("=" * 70)
    print(f"Dataset: {len(train_data)} sequences")
    print(f"Time limit: {max_time_seconds}s ({max_time_seconds/60:.1f} minutes)")
    print()

    start_time = time.time()
    losses = []
    step = 0

    # One checkpoint per full minute inside the budget.  The default budget of
    # 300 seconds yields 60, 120, 180, 240, 300; longer budgets keep reporting
    # instead of going silent after five minutes.
    checkpoints = list(range(60, int(max_time_seconds) + 1, 60))
    checkpoint_idx = 0

    print("Training started...")
    print()

    while True:
        # Check time limit
        elapsed = time.time() - start_time
        if elapsed >= max_time_seconds:
            break

        # Sample random sequence
        tokens = train_data[np.random.randint(len(train_data))]

        # Next token prediction
        input_seq = tokens[:-1]
        target_seq = tokens[1:]
        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)

        # Forward
        logits = model.forward(x)

        # Loss
        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = y_true.reshape(batch_size * seq_len)
        loss = loss_fn.forward(logits_flat, targets_flat)

        # Backward
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients element-wise, in place
        for param in model.parameters():
            if param.grad is not None:
                np.clip(param.grad, -1.0, 1.0, out=param.grad)

        # Update
        optimizer.step()
        losses.append(loss.data.item())
        step += 1

        # Show progress at checkpoints (elapsed is the time at the start of this step)
        if checkpoint_idx < len(checkpoints) and elapsed >= checkpoints[checkpoint_idx]:
            avg_loss = np.mean(losses[-50:])
            steps_per_sec = step / elapsed
            print(f"[{int(elapsed):3d}s] Step {step:4d} | Loss: {avg_loss:.4f} | Speed: {steps_per_sec:.2f} steps/sec")
            checkpoint_idx += 1

        # Also show every 50 steps, but only before the first checkpoint was reported
        if step % 50 == 0 and checkpoint_idx == 0:
            avg_loss = np.mean(losses[-50:])
            print(f"[{int(elapsed):3d}s] Step {step:4d} | Loss: {avg_loss:.4f}")

    if not losses:
        # The budget expired before a single step ran; there is nothing to summarize
        return losses, step

    final_elapsed = time.time() - start_time
    final_loss = np.mean(losses[-100:])
    initial_loss = np.mean(losses[:10])
    improvement = (1 - final_loss / initial_loss) * 100

    print()
    print("=" * 70)
    print("TRAINING COMPLETE")
    print("=" * 70)
    print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.2f} minutes)")
    print(f"Total steps: {step}")
    print(f"Steps/second: {step/final_elapsed:.2f}")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()
    return losses, step
# ============================================================================
# Testing
# ============================================================================
def test_generation(model, test_sequences, char_to_idx, idx_to_char):
    """Test generation quality on at most the first 15 sequences.

    Every tested sequence is encoded to length 10, fed to the model without
    its last token, and the greedy (argmax) prediction at each non-padding
    target position is compared with the shifted target.

    Args:
        model: Object providing ``forward(x)``.
        test_sequences: Strings to check; only the first 15 are used.
        char_to_idx: Character -> token ID mapping.
        idx_to_char: Token ID -> character mapping.

    Returns:
        Percentage (0-100) of tested sequences predicted correctly; 0.0 when
        there is nothing to test.
    """
    print("=" * 70)
    print("TESTING GENERATION")
    print("=" * 70)
    print()

    # Accuracy is measured against the number of sequences actually tested,
    # which can be fewer than 15 for a short input list.
    tested = test_sequences[:15]
    total = len(tested)
    correct = 0

    for seq in tested:
        tokens = encode(seq, char_to_idx, max_len=10)

        # Get predictions
        x = Tensor(np.array([tokens[:-1]], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        # Predict each position
        predicted_tokens = [
            int(np.argmax(logits.data[0, i, :]))
            for i in range(logits.shape[1])
        ]

        # Compare, skipping padded target positions
        expected = tokens[1:]
        match = all(e == p for e, p in zip(expected, predicted_tokens) if e != 0)

        # Distinct markers so passing and failing rows can be told apart
        if match:
            correct += 1
            status = "✓"
        else:
            status = "✗"

        expected_str = decode(expected, idx_to_char)
        predicted_str = decode(predicted_tokens, idx_to_char)
        print(f"{status} Input: {seq[:6]:8s} → Expected: {expected_str:8s} | Got: {predicted_str:8s}")

    accuracy = (correct / total) * 100 if total else 0.0
    print()
    print(f"Accuracy: {correct}/{total} ({accuracy:.1f}%)")
    print()
    return accuracy
# ============================================================================
# Main
# ============================================================================
def main():
    """Run the 5-minute training demo end to end.

    Builds the dataset and tokenizer, creates a very small GPT model, trains
    it with ``train_5_minutes``, evaluates it with ``test_generation`` and
    prints a summary with a verdict based on the accuracy.
    """
    print()
    print("=" * 70)
    print("MILESTONE 05 - 5-MINUTE TRAINING TEST")
    print("=" * 70)
    print()
    print("Let's find out what we can learn in exactly 5 minutes!")
    print()

    # Dataset
    sequences = create_dataset()
    char_to_idx, idx_to_char = create_tokenizer(sequences)
    vocab_size = len(idx_to_char)
    print(f"Dataset: {len(sequences)} sequences (with repetition)")
    print(f"Unique sequences: {len(set(sequences))}")
    print(f"Vocabulary: {vocab_size} tokens")
    print()

    # Encode
    train_data = [encode(seq, char_to_idx, max_len=10) for seq in sequences]

    # Model: Ultra-tiny for maximum steps in 5 mins
    # Goal: <1s per step → ~300+ steps in 5 mins
    # Strategy: Minimize params for speed
    config = {
        'vocab_size': vocab_size,
        'embed_dim': 16,   # Very small
        'num_layers': 1,   # Just 1 layer!
        'num_heads': 2,    # 2 heads
        'max_seq_len': 10,
    }
    print("Model configuration:")
    for key, val in config.items():
        print(f" {key}: {val}")
    print()

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    print()

    # Optimizer
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Train for 5 minutes
    print("Starting 5-minute training run...")
    print("(Progress will be shown every minute)")
    print()
    losses, total_steps = train_5_minutes(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        max_time_seconds=300  # 5 minutes
    )

    # Test
    print("Testing what the model learned...")
    print()
    accuracy = test_generation(model, sequences, char_to_idx, idx_to_char)

    # Final summary
    initial_loss = np.mean(losses[:10])
    final_loss = np.mean(losses[-100:])
    print("=" * 70)
    print("5-MINUTE TRAINING SUMMARY")
    print("=" * 70)
    print(f"✓ Model: {num_params:,} parameters")
    print(f"✓ Steps completed: {total_steps}")
    # Initial and final loss are separated by an arrow instead of running together
    print(f"✓ Loss: {initial_loss:.4f} → {final_loss:.4f}")
    print(f"✓ Improvement: {(1 - final_loss/initial_loss)*100:.1f}%")
    print(f"✓ Accuracy: {accuracy:.1f}%")
    print()
    if accuracy >= 60:
        print("🎉 EXCELLENT! Model learned well in 5 minutes!")
    elif accuracy >= 40:
        print("✓ GOOD! Model is learning, could use more training.")
    elif accuracy >= 20:
        print("⚠️ FAIR: Model is learning but needs optimization.")
    else:
        print("⚠️ Model needs more training time or tuning.")
    print()
if __name__ == "__main__":
main()

View File

@@ -1,744 +0,0 @@
#!/usr/bin/env python3
"""
Progressive Test Suite for TinyGPT Learning
Tests transformer learning from absolute simplest to complex:
0. Memorize single sequence (MUST work)
1. Pattern completion (A B A → B)
2. Copy task (COPY: X → X)
3. Simple arithmetic (2+3 → 5)
4. TinyTalks greetings
This helps identify EXACTLY where learning breaks down.
"""
import sys
import os
import numpy as np
import time
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(project_root)
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich import box
console = Console()
def run_test_0_memorize_sequence():
    """
    TEST 0: memorize one repeated sequence.

    Builds a one-block character model over the text "HELLO WORLD " repeated
    100 times and runs 10 optimisation steps on randomly chosen windows.
    Every step prints the shape after each stage of the forward pass, whether
    the loss carries a ``_grad_fn``, and how many parameter tensors ended up
    with a non-zero gradient.

    Returns:
        True if the loss of the last step is below 80% of the loss of the
        first step, otherwise False.
    """
    rule = "=" * 70
    console.print("\n" + rule)
    console.print("[bold cyan]TEST 0: Single Sequence Memorization[/bold cyan]")
    console.print(rule)
    console.print("Task: Memorize 'HELLO WORLD' (repeated 100 times)")
    console.print("Expected: Loss should drop to near 0")
    console.print("Why: If this fails, autograd/optimizer is broken\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.core.autograd import enable_autograd
    from tinytorch.text.tokenization import CharTokenizer
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    enable_autograd()

    # One phrase, repeated: there is nothing to generalise, only to memorise.
    corpus = "HELLO WORLD " * 100

    tokenizer = CharTokenizer()
    tokenizer.build_vocab([corpus])
    data = tokenizer.encode(corpus)
    console.print(f"Data length: {len(data)} tokens")
    console.print(f"Vocabulary: {tokenizer.vocab_size} chars")
    console.print(f"Unique text: '{corpus[:50]}...'\n")

    # Model hyper-parameters (deliberately tiny).
    vocab_size = tokenizer.vocab_size
    embed_dim = 32
    seq_len = 16

    # Components are created in this fixed order.
    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(seq_len, embed_dim)
    transformer = TransformerBlock(embed_dim, num_heads=2, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    params = []
    for component in (embedding, pos_enc, transformer, ln, output_proj):
        params.extend(component.parameters())
    for tensor in params:
        tensor.requires_grad = True

    console.print(f"Model: {len(params)} parameter tensors")
    console.print(f"Embed dim: {embed_dim}, Seq len: {seq_len}\n")

    optimizer = Adam(params, lr=0.01)
    criterion = CrossEntropyLoss()
    console.print("[yellow]Training (10 steps)...[/yellow]")
    console.print("[dim]Watching for: loss decrease, gradient flow, parameter updates[/dim]\n")

    # Forward stages paired with the label printed after each one.
    stages = (
        ("embed_out", embedding),
        ("pos_out", pos_enc),
        ("trans_out", transformer),
        ("ln_out", ln),
    )

    num_steps = 10
    last_step = num_steps - 1
    first_loss = None
    last_loss = None

    for step in range(num_steps):
        # Pick a random window; the target is the same window shifted by one.
        offset = np.random.randint(0, len(data) - seq_len - 1)
        window = data[offset:offset + seq_len]
        shifted = data[offset + 1:offset + seq_len + 1]
        console.print(f"[dim]Step {step+1}:[/dim]", end=" ")

        hidden = Tensor(np.array([window]))
        targets = Tensor(np.array([shifted]))
        console.print(f"input shape={hidden.shape}", end=" ")

        for label, stage in stages:
            hidden = stage(hidden)
            console.print(f"{label}={hidden.shape}", end=" ")

        # The output projection is applied to a 2-D view and reshaped back.
        batch, seq, dim = hidden.shape
        hidden_2d = hidden.reshape(batch * seq, dim)
        logits_2d = output_proj(hidden_2d)
        logits = logits_2d.reshape(batch, seq, vocab_size)
        console.print(f"logits={logits.shape}", end=" ")

        logits_flat = logits.reshape(batch * seq, vocab_size)
        targets_flat = targets.reshape(-1)
        console.print(f"logits_flat={logits_flat.shape} targets_flat={targets_flat.shape}", end=" ")

        loss = criterion(logits_flat, targets_flat)
        loss_val = float(loss.data)
        console.print(f"loss={loss_val:.4f}", end=" ")

        # A loss without a grad function cannot propagate anything backwards.
        has_grad_fn = getattr(loss, '_grad_fn', None) is not None
        console.print(f"has_grad_fn={has_grad_fn}", end=" ")

        optimizer.zero_grad()
        console.print("backward...", end=" ")
        loss.backward()

        # Count parameter tensors that received a gradient with any non-zero entry.
        params_with_grad = 0
        for tensor in params:
            if tensor.grad is not None and np.any(tensor.grad != 0):
                params_with_grad += 1
        console.print(f"params_w_grad={params_with_grad}/{len(params)}", end=" ")

        optimizer.step()
        console.print("updated")

        if step == 0:
            first_loss = loss_val
            console.print(f" [yellow]→ Initial loss: {first_loss:.4f}[/yellow]")
        if step == last_step:
            last_loss = loss_val
        if step > 0 and step % 2 == 0:
            console.print(f" [cyan]→ Loss so far: {loss_val:.4f}[/cyan]")

    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial loss: {first_loss:.4f}")
    console.print(f" Final loss: {last_loss:.4f}")
    console.print(f" Decrease: {first_loss - last_loss:.4f}")

    if not last_loss < first_loss * 0.8:
        console.print(" [red]✗ FAIL: Loss didn't decrease enough[/red]")
        console.print(" [red]→ Bug in: autograd, optimizer, or forward pass[/red]")
        return False

    console.print(" [green]✓ PASS: Loss decreased significantly[/green]")
    return True
def run_test_1_pattern_completion():
    """
    TEST 1: pattern completion.

    Trains a one-block character model on three alternating patterns
    ("A B ...", "1 2 ...", "X Y ..."), repeated 50 times, using randomly
    chosen windows of 8 characters and next-character targets.

    Returns:
        True if the loss fell by more than 0.5 between the first and the last
        training step, otherwise False.
    """
    console.print("\n" + "=" * 70)
    console.print("[bold cyan]TEST 1: Pattern Completion[/bold cyan]")
    console.print("=" * 70)
    console.print("Task: Learn repeating patterns (ABAB... → A, 1212... → 1)")
    console.print("Expected: Predict next token correctly after training")
    console.print("Why: Tests if attention can learn simple sequences\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    # Create pattern data
    patterns = [
        "A B A B A B A B A B ",
        "1 2 1 2 1 2 1 2 1 2 ",
        "X Y X Y X Y X Y X Y ",
    ]
    text = "".join(patterns * 50)  # Repeat 50 times
    console.print(f"Data: {len(text)} chars")
    console.print("Patterns: ABAB, 1212, XYXY")
    console.print(f"Sample: '{text[:40]}...'\n")

    # Character-level vocabulary. Only the char -> index direction is needed
    # here because nothing is decoded back to text.
    chars = sorted(set(text))
    vocab_size = len(chars)
    char_to_idx = {ch: i for i, ch in enumerate(chars)}
    data = np.array([char_to_idx[ch] for ch in text])
    console.print(f"Vocab: {vocab_size} chars: {repr(''.join(chars))}\n")

    # Build tiny model
    embed_dim = 32
    num_heads = 2
    seq_len = 8
    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters()
    for p in params:
        p.requires_grad = True

    optimizer = Adam(params, lr=0.01)
    criterion = CrossEntropyLoss()

    # Single source of truth for the step count: the loop bound, the
    # "last step" checks and the progress message all derive from it.
    num_steps = 30
    last_step = num_steps - 1
    console.print(f"[yellow]Training ({num_steps} steps on patterns)...[/yellow]")

    initial_loss = None
    final_loss = None
    for step in range(num_steps):
        # Random window; the target is the window shifted one character ahead.
        start = np.random.randint(0, len(data) - seq_len - 1)
        input_seq = data[start:start + seq_len]
        target_seq = data[start + 1:start + seq_len + 1]
        x = Tensor(np.array([input_seq]))
        y = Tensor(np.array([target_seq]))

        x = embedding(x)
        x = pos_enc(x)
        x = transformer(x)
        x = ln(x)

        # Project through a 2-D view, reshape back, then flatten for the loss.
        batch, seq, dim = x.shape
        x_2d = x.reshape(batch * seq, dim)
        logits_2d = output_proj(x_2d)
        logits = logits_2d.reshape(batch, seq, vocab_size)
        logits_flat = logits.reshape(batch * seq, vocab_size)
        targets_flat = y.reshape(-1)
        loss = criterion(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = float(loss.data)
        if step == 0:
            initial_loss = loss_val
        if step == last_step:
            final_loss = loss_val
        if step % 10 == 0 or step == last_step:
            console.print(f" Step {step+1}: Loss = {loss_val:.4f}")

    decrease = initial_loss - final_loss
    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial: {initial_loss:.4f}")
    console.print(f" Final: {final_loss:.4f}")
    console.print(f" Decrease: {decrease:.4f}")

    if decrease > 0.5:
        console.print(" [green]✓ PASS: Loss decreased significantly[/green]")
        return True
    console.print(" [red]✗ FAIL: Loss didn't decrease enough[/red]")
    return False
def run_test_2_copy_task():
    """
    TEST 2: copy task.

    The corpus is made of entries of the form "COPY:<word>=<word> " for six
    fixed words, repeated 50 times. A one-block character model is trained
    for 40 steps on randomly chosen 16-character windows with
    next-character targets.

    Returns:
        True if the loss fell by more than 0.5 between the first and the last
        training step, otherwise False.
    """
    rule = "=" * 70
    console.print("\n" + rule)
    console.print("[bold cyan]TEST 2: Copy Task[/bold cyan]")
    console.print(rule)
    console.print("Task: COPY: X → X (reproduce input)")
    console.print("Expected: Model learns to copy the input text")
    console.print("Why: Classic test of attention mechanism\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    # Corpus: every word appears once as the prompt and once as the answer.
    words = ["hello", "world", "test", "copy", "learn", "task"]
    examples = [f"COPY:{word}={word} " for word in words]
    text = "".join(examples * 50)
    console.print(f"Data: {len(text)} chars")
    console.print("Examples: COPY:hello=hello, COPY:world=world")
    console.print(f"Sample: '{text[:50]}...'\n")

    # Character-level vocabulary, indices assigned in sorted order.
    alphabet = sorted(set(text))
    vocab_size = len(alphabet)
    char_to_idx = {ch: index for index, ch in enumerate(alphabet)}
    data = np.array([char_to_idx[ch] for ch in text])
    console.print(f"Vocab: {vocab_size} chars\n")

    embed_dim = 32
    num_heads = 2
    seq_len = 16

    # Components are created in this fixed order.
    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    params = [embedding.weight]
    params = params + pos_enc.parameters()
    params = params + transformer.parameters()
    params = params + ln.parameters()
    params = params + output_proj.parameters()
    for tensor in params:
        tensor.requires_grad = True

    optimizer = Adam(params, lr=0.01)
    criterion = CrossEntropyLoss()
    console.print("[yellow]Training (40 steps on copy task)...[/yellow]")

    num_steps = 40
    last_step = num_steps - 1
    loss_history = []

    for step in range(num_steps):
        # Random window; the target is the window shifted one character ahead.
        offset = np.random.randint(0, len(data) - seq_len - 1)
        hidden = Tensor(np.array([data[offset:offset + seq_len]]))
        targets = Tensor(np.array([data[offset + 1:offset + seq_len + 1]]))

        for stage in (embedding, pos_enc, transformer, ln):
            hidden = stage(hidden)

        # Project through a 2-D view, reshape back, then flatten for the loss.
        batch, seq, dim = hidden.shape
        logits_2d = output_proj(hidden.reshape(batch * seq, dim))
        logits = logits_2d.reshape(batch, seq, vocab_size)
        loss = criterion(logits.reshape(batch * seq, vocab_size), targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = float(loss.data)
        loss_history.append(loss_val)
        if step == last_step or step % 10 == 0:
            console.print(f" Step {step+1}: Loss = {loss_val:.4f}")

    initial_loss = loss_history[0]
    final_loss = loss_history[-1]
    decrease = initial_loss - final_loss
    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial: {initial_loss:.4f}")
    console.print(f" Final: {final_loss:.4f}")
    console.print(f" Decrease: {decrease:.4f}")

    if not decrease > 0.5:
        console.print(" [red]✗ FAIL: Loss didn't decrease enough[/red]")
        return False

    console.print(" [green]✓ PASS: Loss decreased[/green]")
    return True
def run_test_3_simple_arithmetic():
    """
    TEST 3: simple arithmetic.

    The corpus lists every sum "a+b=c " for a and b in 1..5, repeated 30
    times. A one-block character model (48-dim embeddings, 3 heads) is
    trained for 50 steps on randomly chosen 12-character windows with
    next-character targets.

    Returns:
        True if the loss fell by more than 0.3 between the first and the last
        training step, otherwise False.
    """
    rule = "=" * 70
    console.print("\n" + rule)
    console.print("[bold cyan]TEST 3: Simple Arithmetic[/bold cyan]")
    console.print(rule)
    console.print("Task: 2+3=5, 1+1=2, etc. (single digit)")
    console.print("Expected: Correct answers after training")
    console.print("Why: Tests reasoning ability\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    # Corpus: all 25 additions with operands 1..5, outer operand varying slowest.
    examples = [f"{a}+{b}={a+b} " for a in range(1, 6) for b in range(1, 6)]
    text = "".join(examples * 30)
    console.print(f"Data: {len(text)} chars")
    console.print("Examples: 1+1=2, 2+3=5, 4+5=9")
    console.print(f"Sample: '{text[:40]}...'\n")

    # Character-level vocabulary, indices assigned in sorted order.
    alphabet = sorted(set(text))
    vocab_size = len(alphabet)
    char_to_idx = {ch: index for index, ch in enumerate(alphabet)}
    data = np.array([char_to_idx[ch] for ch in text])
    console.print(f"Vocab: {vocab_size} chars\n")

    embed_dim = 48
    num_heads = 3
    seq_len = 12

    # Components are created in this fixed order.
    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    params = [embedding.weight]
    params = params + pos_enc.parameters()
    params = params + transformer.parameters()
    params = params + ln.parameters()
    params = params + output_proj.parameters()
    for tensor in params:
        tensor.requires_grad = True

    optimizer = Adam(params, lr=0.01)
    criterion = CrossEntropyLoss()
    console.print("[yellow]Training (50 steps on arithmetic)...[/yellow]")

    num_steps = 50
    last_step = num_steps - 1
    loss_history = []

    for step in range(num_steps):
        # Random window; the target is the window shifted one character ahead.
        offset = np.random.randint(0, len(data) - seq_len - 1)
        hidden = Tensor(np.array([data[offset:offset + seq_len]]))
        targets = Tensor(np.array([data[offset + 1:offset + seq_len + 1]]))

        for stage in (embedding, pos_enc, transformer, ln):
            hidden = stage(hidden)

        # Project through a 2-D view, reshape back, then flatten for the loss.
        batch, seq, dim = hidden.shape
        logits_2d = output_proj(hidden.reshape(batch * seq, dim))
        logits = logits_2d.reshape(batch, seq, vocab_size)
        loss = criterion(logits.reshape(batch * seq, vocab_size), targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = float(loss.data)
        loss_history.append(loss_val)
        if step == last_step or step % 10 == 0:
            console.print(f" Step {step+1}: Loss = {loss_val:.4f}")

    initial_loss = loss_history[0]
    final_loss = loss_history[-1]
    decrease = initial_loss - final_loss
    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial: {initial_loss:.4f}")
    console.print(f" Final: {final_loss:.4f}")
    console.print(f" Decrease: {decrease:.4f}")

    if not decrease > 0.3:
        console.print(" [red]✗ FAIL: Loss didn't decrease enough[/red]")
        return False

    console.print(" [green]✓ PASS: Loss decreased[/green]")
    console.print(" [dim](arithmetic is harder, so lower threshold)[/dim]")
    return True
def run_test_4_tinytalks_level1():
    """
    TEST 4: TinyTalks Level 1 (short greeting-style Q&A pairs).

    Reads ``datasets/tinytalks/splits/train.txt`` (path relative to the
    current working directory), keeps Q/A pairs whose lines are both shorter
    than 40 characters, and trains a one-block character model for 100 steps
    on the first 10 such pairs.

    Returns:
        True  if the loss fell by more than 0.3 between the first and the
              last training step.
        False if it did not.
        None  if the dataset file is missing or the selected text is too
              short to cut a single (input, target) window from it.
    """
    console.print("\n" + "=" * 70)
    console.print("[bold cyan]TEST 4: TinyTalks Level 1[/bold cyan]")
    console.print("=" * 70)
    console.print("Task: Learn greeting Q&A pairs from TinyTalks")
    console.print("Expected: Can respond to greetings")
    console.print("Why: The actual milestone goal\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    # Load TinyTalks data. Only the open/read can raise FileNotFoundError, so
    # only that is guarded. The encoding is explicit so the result does not
    # depend on the platform default.
    try:
        with open("datasets/tinytalks/splits/train.txt", "r", encoding="utf-8") as f:
            full_text = f.read()
    except FileNotFoundError:
        console.print("[red]TinyTalks not found, skipping Test 4[/red]")
        return None

    # Heuristic: Level 1 = very short Q&A (< 40 chars each).
    # The file is read in strides of three lines: Q, A, blank.
    lines = full_text.split('\n')
    level_1_text = []
    for i in range(0, len(lines) - 1, 3):  # range bound guarantees i + 1 is valid
        q_line = lines[i]
        a_line = lines[i + 1]
        if not (q_line.startswith('Q:') and a_line.startswith('A:')):
            continue
        if len(q_line) < 40 and len(a_line) < 40:
            level_1_text.append(q_line + '\n' + a_line + '\n\n')

    if not level_1_text:
        console.print("[red]No Level 1 data found, using first 10 Q&A[/red]")
        level_1_text = [full_text[:500]]

    text = "".join(level_1_text[:10])  # First 10 simple Q&A
    console.print(f"Data: {len(text)} chars (Level 1 greetings)")
    console.print(f"Sample:\n{text[:100]}...\n")

    # Tokenize
    chars = sorted(set(text))
    vocab_size = len(chars)
    char_to_idx = {ch: i for i, ch in enumerate(chars)}
    data = np.array([char_to_idx[ch] for ch in text])
    console.print(f"Vocab: {vocab_size} chars\n")

    # Build model (slightly larger for Q&A)
    embed_dim = 64
    num_heads = 4
    seq_len = 32
    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters()
    for p in params:
        p.requires_grad = True

    optimizer = Adam(params, lr=0.005)  # Lower LR for Q&A
    criterion = CrossEntropyLoss()

    num_steps = 100
    last_step = num_steps - 1
    console.print(f"[yellow]Training ({num_steps} steps on TinyTalks Level 1)...[/yellow]")

    # A window needs seq_len inputs plus one extra character for the shifted
    # target, so valid start offsets are 0 .. len(data) - seq_len - 1.
    # np.random.randint's upper bound is exclusive, hence num_windows below.
    # The check is loop-invariant, so it is done once; it also covers the
    # len(data) == seq_len + 1 case, which previously slipped past the guard
    # and crashed in randint(0, 0).
    num_windows = len(data) - seq_len
    if num_windows < 1:
        console.print("[red]Dataset too small[/red]")
        return None

    initial_loss = None
    final_loss = None
    for step in range(num_steps):
        start = np.random.randint(0, num_windows)
        input_seq = data[start:start + seq_len]
        target_seq = data[start + 1:start + seq_len + 1]
        x = Tensor(np.array([input_seq]))
        y = Tensor(np.array([target_seq]))

        x = embedding(x)
        x = pos_enc(x)
        x = transformer(x)
        x = ln(x)

        # Project through a 2-D view, reshape back, then flatten for the loss.
        batch, seq, dim = x.shape
        x_2d = x.reshape(batch * seq, dim)
        logits_2d = output_proj(x_2d)
        logits = logits_2d.reshape(batch, seq, vocab_size)
        logits_flat = logits.reshape(batch * seq, vocab_size)
        targets_flat = y.reshape(-1)
        loss = criterion(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = float(loss.data)
        if step == 0:
            initial_loss = loss_val
        if step == last_step:
            final_loss = loss_val
        if step % 20 == 0 or step == last_step:
            console.print(f" Step {step+1}: Loss = {loss_val:.4f}")

    decrease = initial_loss - final_loss
    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial: {initial_loss:.4f}")
    console.print(f" Final: {final_loss:.4f}")
    console.print(f" Decrease: {decrease:.4f}")

    if decrease > 0.3:
        console.print(" [green]✓ PASS: Model is learning TinyTalks![/green]")
        console.print(" [cyan]→ Now train full model with tinytalks_gpt.py[/cyan]")
        return True
    console.print(" [yellow]⚠ PARTIAL: Some learning, may need more steps[/yellow]")
    return False
def _run_guarded(test_num, test_fn):
    """Run one diagnostic; report a crash as a failure instead of propagating it."""
    try:
        return test_fn()
    except Exception as e:  # top-level boundary: the summary must still be printed
        console.print(f"\n[red]Test {test_num} crashed: {str(e)}[/red]")
        return False


def main():
    """
    Run the diagnostic tests in order and print a summary.

    Test 0 always runs. Tests 1-4 run only when test 0 returned a truthy
    result; each of them is isolated, so a crash in one is recorded as a
    failure and the remaining tests and the summary still run.

    In the summary a result of True is shown as PASS, False as FAIL and
    anything else (e.g. None from a skipped test) as TODO.
    """
    console.print("\n")
    console.print(Panel(
        "[bold cyan]TinyGPT Learning Diagnostic Suite[/bold cyan]\n\n"
        "Progressive tests from simplest to complex:\n"
        " 0. Single sequence memorization (MUST work)\n"
        " 1. Pattern completion (A B A → B)\n"
        " 2. Copy task (COPY: X → X)\n"
        " 3. Simple arithmetic (2+3 → 5)\n"
        " 4. TinyTalks greetings (Q&A)\n\n"
        "[yellow]This identifies EXACTLY where learning breaks down[/yellow]",
        title="🔬 Diagnostic Tests",
        border_style="cyan",
        box=box.DOUBLE
    ))

    results = {}

    # Test 0 is the gate: if single-sequence memorization fails, the later
    # tests cannot tell us anything new.
    results[0] = _run_guarded(0, run_test_0_memorize_sequence)

    if results.get(0):
        follow_ups = (
            (1, run_test_1_pattern_completion),
            (2, run_test_2_copy_task),
            (3, run_test_3_simple_arithmetic),
            (4, run_test_4_tinytalks_level1),
        )
        for test_num, test_fn in follow_ups:
            results[test_num] = _run_guarded(test_num, test_fn)

    # Summary
    console.print("\n" + "=" * 70)
    console.print("[bold]Test Summary:[/bold]")
    console.print("=" * 70)
    for test_num, result in results.items():
        if result is True:
            console.print(f" Test {test_num}: [green]✓ PASS[/green]")
        elif result is False:
            console.print(f" Test {test_num}: [red]✗ FAIL[/red]")
        else:
            console.print(f" Test {test_num}: [yellow]○ TODO[/yellow]")
    console.print("\n" + "=" * 70)

    if results.get(0) is False:
        console.print("[bold red]CRITICAL: Test 0 failed![/bold red]")
        console.print("The transformer cannot even memorize a single sequence.")
        console.print("This indicates a fundamental bug in:")
        console.print(" - Forward pass computation")
        console.print(" - Autograd backward pass")
        console.print(" - Optimizer parameter updates")
        console.print(" - Loss computation")
if __name__ == "__main__":
main()

View File

@@ -1,70 +0,0 @@
#!/usr/bin/env python3
"""
Quick diagnostic of the tokenization stage: shows how the Q:/A: markers that
the model is expected to learn are encoded and decoded. No model is trained here.
"""
import sys
import os
import numpy as np
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(project_root)
from tinytorch.core.tensor import Tensor
from tinytorch.core.optimizers import Adam
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.core.autograd import enable_autograd
from tinytorch.text.tokenization import CharTokenizer
# Enable autograd
enable_autograd()

# Tiny Q&A corpus: the only structure in it is that an "A:" line follows
# every "Q:" line.
test_data = """Q: Hello!
A: Hi there!
Q: What is your name?
A: I am TinyBot.
Q: What color is the sky?
A: The sky is blue.
"""
print("Testing if model can learn simple patterns...")
print(f"Test data: {repr(test_data[:100])}...")

# Build tokenizer
tokenizer = CharTokenizer()
tokenizer.build_vocab([test_data])
tokens = tokenizer.encode(test_data)
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Total tokens: {len(tokens)}")
print(f"First 20 tokens: {tokens[:20]}")
print(f"Decoded: {repr(tokenizer.decode(tokens[:20]))}")

# Check specific patterns
q_colon_tokens = tokenizer.encode("Q:")
print(f"\n'Q:' tokens: {q_colon_tokens}")
print(f"'Q:' decoded: {repr(tokenizer.decode(q_colon_tokens))}")
a_colon_tokens = tokenizer.encode("A:")
print(f"'A:' tokens: {a_colon_tokens}")
print(f"'A:' decoded: {repr(tokenizer.decode(a_colon_tokens))}")

# Count how often each marker occurs in the raw text
print("\nPattern analysis:")
text_str = test_data
q_count = text_str.count("Q:")
a_count = text_str.count("A:")
print(f"'Q:' appears: {q_count} times")
print(f"'A:' appears: {a_count} times")

# Verify the tokenizer before declaring it healthy: encoding then decoding
# the corpus must give the corpus back.
# NOTE(review): assumes CharTokenizer.decode returns a str - confirm.
round_trip_ok = tokenizer.decode(tokens) == test_data
if round_trip_ok:
    print("\n✅ Tokenizer is working correctly!")
    print("\nConclusion: The model should be able to learn that 'A:' follows 'Q:'")
    print("If it's generating garbage, the model is either:")
    print(" 1. Too small (need more parameters)")
    print(" 2. Not trained enough (need more epochs)")
    print(" 3. Learning rate is wrong")
    print(" 4. Or there's a bug in the training loop")
else:
    print("\n❌ Tokenizer round-trip failed: decode(encode(text)) != text")
    print("\nConclusion: Fix tokenization first - the model never sees the real text.")

View File

@@ -1,604 +0,0 @@
#!/usr/bin/env python3
"""
TinyStories Text Generation (2017) - Transformer Era
====================================================
📚 HISTORICAL CONTEXT:
In 2017, Vaswani et al. published "Attention Is All You Need", showing that
attention mechanisms alone (no RNNs!) could achieve state-of-the-art results
on sequence tasks. This breakthrough launched the era of GPT, BERT, and modern LLMs.
🎯 WHAT YOU'RE BUILDING:
Using YOUR TinyTorch implementations, you'll build a character-level language model
that generates simple stories - proving YOUR attention mechanism works!
TinyStories is MUCH EASIER than Shakespeare:
- Simple vocabulary (children's stories vs archaic English)
- Clear sentence structure
- Designed specifically for small models like ours!
- Faster convergence and better results
✅ REQUIRED MODULES (Run after Module 13):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Module 02 (Tensor) : YOUR data structure with autograd
Module 03 (Activations) : YOUR ReLU in feed-forward networks
Module 04 (Layers) : YOUR Linear layers
Module 08 (Optimizers) : YOUR Adam optimizer
Module 10 (Tokenization) : YOUR CharTokenizer for text→numbers
Module 11 (Embeddings) : YOUR token & positional embeddings
Module 12 (Attention) : YOUR multi-head self-attention
Module 13 (Transformers) : YOUR LayerNorm + TransformerBlock
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🏗️ ARCHITECTURE (Character-Level Language Model):
┌──────────────────────────────────────────────────────────────────────────────┐
│ Output Predictions │
│ Character Probabilities (vocab_size) │
└──────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────────────┐
│ Output Projection │
│ Module 04: vectors → vocabulary │
└──────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────────────┐
│ Layer Norm │
│ Module 13: Final normalization │
└──────────────────────────────────────────────────────────────────────────────┘
╔══════════════════════════════════════════════════════════════════════════════╗
║ Transformer Block × N (Repeat) ║
║ ┌────────────────────────────────────────────────────────────────────────┐ ║
║ │ Feed Forward Network │ ║
║ │ Module 04: Linear → ReLU → Linear │ ║
║ └────────────────────────────────────────────────────────────────────────┘ ║
║ ▲ ║
║ ┌────────────────────────────────────────────────────────────────────────┐ ║
║ │ Multi-Head Self-Attention │ ║
║ │ Module 12: Query·Key^T·Value across all positions │ ║
║ └────────────────────────────────────────────────────────────────────────┘ ║
╚══════════════════════════════════════════════════════════════════════════════╝
┌──────────────────────────────────────────────────────────────────────────────┐
│ Positional Encoding │
│ Module 11: Add position information │
└──────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────────────┐
│ Character Embeddings │
│ Module 11: chars → embed_dim vectors │
└──────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────────────┐
│ Input Characters │
│ "Once upon a time, there was a..." │
└──────────────────────────────────────────────────────────────────────────────┘
📊 EXPECTED PERFORMANCE:
- Dataset: ~21MB TinyStories validation set (simple children's stories)
- Training time: 30-45 minutes (proper training, faster than Shakespeare!)
- Vocabulary: ~90 unique characters (simple English)
- Expected: Coherent simple stories with proper grammar
- Parameters: ~4.8M (perfect size for this task)
"""
import sys
import os
import numpy as np
import argparse
import time
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich import box
# Add project root to path
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_root)
console = Console()
# Import TinyTorch components YOU BUILT!
from tinytorch.core.tensor import Tensor # Module 02: YOU built this!
from tinytorch.core.layers import Linear # Module 04: YOU built this!
from tinytorch.core.activations import ReLU, Softmax # Module 03: YOU built this!
from tinytorch.core.optimizers import Adam # Module 08: YOU built this!
from tinytorch.core.losses import CrossEntropyLoss # Module 04: YOU built this!
from tinytorch.text.tokenization import CharTokenizer # Module 10: YOU built this!
from tinytorch.text.embeddings import Embedding, PositionalEncoding # Module 11: YOU built this!
from tinytorch.core.attention import MultiHeadAttention # Module 12: YOU built this!
from tinytorch.models.transformer import LayerNorm, TransformerBlock # Module 13: YOU built this!
from tinytorch.data.loader import DataLoader, Dataset # Module 08: YOU built this!
# Import dataset manager
from data_manager import DatasetManager
class TinyStoriesDataset(Dataset):
    """
    Character-level TinyStories dataset using YOUR Dataset interface (Module 08)
    and YOUR CharTokenizer (Module 10)!

    The whole text is tokenized once into character indices. Item ``idx`` is
    the window of ``seq_length`` indices starting at ``idx`` together with the
    same window shifted one position ahead (the next-character targets).
    """

    def __init__(self, text, seq_length=64):
        """
        Initialize dataset with text and sequence length.

        Args:
            text: Raw TinyStories text
            seq_length: Length of input sequences
        """
        # Use YOUR CharTokenizer from Module 10!
        self.tokenizer = CharTokenizer()
        self.tokenizer.build_vocab([text])  # Build vocabulary from the story corpus
        self.vocab_size = self.tokenizer.vocab_size

        # Convert text to indices using YOUR tokenizer!
        self.data = self.tokenizer.encode(text)
        self.seq_length = seq_length

        # Each item needs seq_length inputs plus one extra token for the
        # shifted target, so there are len(data) - seq_length full windows.
        # Clamp at zero: a text shorter than seq_length yields an empty
        # dataset instead of a negative length (which makes len() raise).
        self.num_sequences = max(0, len(self.data) - seq_length)

    def __getitem__(self, idx):
        """Get a single (input, target) training pair - YOUR Dataset interface!"""
        # Input: characters at positions [idx, idx+seq_length)
        # Target: characters at positions [idx+1, idx+seq_length+1)
        input_seq = self.data[idx:idx + self.seq_length]
        target_seq = self.data[idx + 1:idx + self.seq_length + 1]
        return Tensor(np.array(input_seq, dtype=np.int32)), Tensor(np.array(target_seq, dtype=np.int32))

    def __len__(self):
        """Return dataset size (number of full windows) - YOUR Dataset interface!"""
        return self.num_sequences

    def decode(self, indices):
        """Convert indices back to text using YOUR tokenizer!"""
        return self.tokenizer.decode(indices)
class TinyGPT:
    """
    Character-level Transformer language model assembled from TinyTorch parts.

    Pipeline: token embedding -> positional encoding -> ``num_layers``
    transformer blocks -> final layer norm -> linear projection to
    vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers):
        # Input representation (Module 11)
        self.embedding = Embedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(max_length, embed_dim)

        # Stack of identical blocks (Module 13); the feed-forward part uses
        # the standard 4x expansion.
        mlp_ratio = 4
        self.layers = [
            TransformerBlock(embed_dim, num_heads, mlp_ratio)
            for _ in range(num_layers)
        ]

        # Output head: final normalisation (Module 13) + vocabulary projection (Module 04)
        self.layer_norm = LayerNorm(embed_dim)
        self.output_proj = Linear(embed_dim, vocab_size)

        # Configuration kept for later inspection
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads

        # Needs every component above to exist already
        self.total_params = self._count_parameters()

    def _count_parameters(self):
        """Return the total number of scalar values across all parameters."""
        return sum(param.data.size for param in self.parameters())

    def parameters(self):
        """
        Collect every trainable parameter of the model.

        Order: embedding weight, positional-encoding parameters, the
        parameters of each block in stack order, layer-norm gamma/beta,
        output-projection weight/bias. As a side effect every returned
        parameter gets ``requires_grad = True``.
        """
        collected = [self.embedding.weight]
        collected.extend(self.pos_encoding.parameters())

        for block in self.layers:
            if not hasattr(block, 'parameters'):
                continue
            # A block may expose its parameters as a method or as a plain sequence.
            source = block.parameters
            collected.extend(source() if callable(source) else source)

        collected += [self.layer_norm.gamma, self.layer_norm.beta]
        collected += [self.output_proj.weight, self.output_proj.bias]

        for param in collected:
            param.requires_grad = True
        return collected

    def forward(self, x):
        """Map token indices ``x`` to per-position vocabulary logits."""
        hidden = self.embedding.forward(x)             # Module 11: char → vectors
        hidden = self.pos_encoding.forward(hidden)     # Module 11: add position info

        for block in self.layers:
            hidden = block.forward(hidden)             # Module 13: Attention → FFN

        hidden = self.layer_norm.forward(hidden)       # Module 13: final norm

        # The projection is applied to a 2-D view; Tensor.reshape() is used on
        # both sides so the computation graph stays connected.
        batch_size, seq_len, embed_dim = hidden.shape
        flat = hidden.reshape(batch_size * seq_len, embed_dim)
        flat_logits = self.output_proj(flat)           # Module 04: vocab predictions
        return flat_logits.reshape(batch_size, seq_len, self.vocab_size)
def visualize_transformer():
    """Print the "Act 1" challenge panel followed by an ASCII data-flow diagram."""
    challenge_text = (
        "[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n"
        "[yellow]The Problem:[/yellow]\n"
        "RNNs process sequences one step at a time\n"
        "Can't parallelize → slow training on long sequences\n"
        "Struggle with long-range dependencies\n\n"
        "[green]The Innovation:[/green]\n"
        "Transformers: Attention mechanisms process ENTIRE sequences in parallel\n"
        " • Self-attention: Every token attends to every other token\n"
        " • Multi-head attention: Learn multiple attention patterns\n"
        " • Positional encoding: Preserve sequence order\n\n"
        "[bold]Can attention alone match RNN performance?[/bold]"
    )
    # Left column: pipeline stages; right column: what each stage learns.
    diagram = """
How YOUR Transformer Sees Text: What It Learns:
Input: "To be or not to be" Layer 1 (Attention):
┌─────────────────────┐ • Each word attends to others
│ T o b e o r ... │ • "be" looks at "To", "or", etc.
└─────────────────────┘ • Captures dependencies
Character Embeddings Layer 2-4 (Deep Attention):
┌─────────────────────┐ • Builds complex patterns
│ 128-dim vectors │ • Grammar, style, meaning
│ for each character │ • Shakespeare-specific patterns
└─────────────────────┘
↓ Output Prediction:
Position Encoding "To be or not to be, that is the"
┌─────────────────────┐ ↓
│ Add positional info │ Next char probabilities:
│ (order matters!) │ 't' → 0.85 (highest!)
└─────────────────────┘ 'n' → 0.03
'a' → 0.02
Transformer Layers ×4 ...
┌─────────────────────┐
│ Self-Attention │ Key Transformer Insight:
│ Feed-Forward │ Unlike RNNs, attention lets each
│ Layer Norm │ position look at ALL others
└─────────────────────┘ simultaneously - capturing long-range
↓ dependencies in O(1) operations!
Character Predictions
┌─────────────────────┐
│ Probability for │
│ each next character │
└─────────────────────┘
"""

    console.print("")
    intro_panel = Panel.fit(
        challenge_text,
        title="🎯 ACT 1: THE CHALLENGE",
        border_style="cyan",
        box=box.DOUBLE,
    )
    console.print(intro_panel)
    console.print(diagram)
    print("="*70)
def train_tinystories_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.01,
                          max_batches=500, log_every=10):
    """Train TinyGPT with Adam and cross-entropy, logging progress to the console.

    Args:
        model: Callable model; ``model(batch_input)`` must return logits whose
            ``.shape`` unpacks into three values, and ``model.parameters()``
            must return the tensors to optimize.
        train_loader: Iterable of ``(batch_input, batch_target)`` pairs that
            also exposes ``.dataset`` and ``.batch_size`` (read for the banner).
        dataset: Not used by the training loop; kept so existing callers work.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Step size handed to Adam.
        max_batches: Upper bound on batches processed per epoch. Defaults to
            500, the value that was previously hard-coded.
        log_every: Print a progress line every this many batches (the first
            batch of each epoch is always reported). Values below 1 are
            treated as 1.

    Returns:
        The same ``model`` object after training.
    """
    console.print("\n[bold]🚀 Training TinyStories TinyGPT with YOUR TinyTorch![/bold]")
    console.print(f" Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences")
    console.print(f" Batch size: [cyan]{train_loader.batch_size}[/cyan]")
    console.print(f" Learning rate: [cyan]{learning_rate}[/cyan] (1e-2, optimal for 4.8M param model)")
    console.print(" YOUR DataLoader (Module 08) handles batching!")
    console.print(" YOUR Adam optimizer (Module 08)")
    console.print(" YOUR CrossEntropyLoss (Module 04) with autograd!")

    # YOUR optimizer and loss function
    optimizer = Adam(model.parameters(), lr=learning_rate)
    loss_fn = CrossEntropyLoss()  # YOUR loss function with autograd!
    log_every = max(1, log_every)  # avoid a modulo-by-zero in the progress check

    for epoch in range(epochs):
        console.print(f"\n [bold]Epoch {epoch+1}/{epochs}:[/bold]")
        epoch_loss = 0
        batch_count = 0

        # Use YOUR DataLoader to iterate through batches!
        for batch_idx, (batch_input, batch_target) in enumerate(train_loader):
            if batch_idx >= max_batches:
                break
            if batch_idx == 0:
                console.print(" [dim]Processing first batch... (this may take a moment)[/dim]")

            # Forward pass with YOUR Transformer
            logits = model(batch_input)

            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the loss.
            # Tensor.reshape() is used so the computation graph is preserved.
            batch_size, seq_length, vocab_size = logits.shape
            logits_2d = logits.reshape(batch_size * seq_length, vocab_size)
            targets_1d = batch_target.reshape(-1)

            loss = loss_fn.forward(logits_2d, targets_1d)
            loss_value = float(loss.data)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss_value
            batch_count += 1

            # Report the first batch and then every ``log_every`` batches.
            # (The old extra "% 50" test was implied by "% 10".)
            if batch_idx == 0 or (batch_idx + 1) % log_every == 0:
                avg_loss = epoch_loss / batch_count
                console.print(f" Batch {batch_idx+1}/{max_batches} | Loss: {loss_value:.4f} | Avg: {avg_loss:.4f}")

        # Epoch summary; max() guards against an empty loader.
        avg_loss = epoch_loss / max(1, batch_count)
        console.print(f" → Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)")
    return model
def generate_text(model, dataset, prompt="To be or not", max_length=200, temperature=0.8):
    """
    Generate text from a prompt by picking one character at a time.

    Each step feeds the most recent ``dataset.seq_length`` token ids
    (left-padded with id 0 when the history is shorter) through ``model``,
    reads the logits at the last position, and chooses the next id.

    Args:
        model: Callable returning logits that are indexed as
            ``[0, -1, :]`` to obtain the last position's scores.
        dataset: Provides ``char_to_idx``, ``seq_length`` and ``decode(ids)``.
        prompt: Seed text; characters missing from ``char_to_idx`` are dropped.
        max_length: Number of characters generated after the prompt.
        temperature: Divides the logits before softmax sampling. Values <= 0
            pick the highest-scoring id instead of sampling (this avoids a
            division by zero).

    Returns:
        The decoded string for the prompt ids plus all generated ids.
    """
    console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]")
    console.print("="*70)

    window = dataset.seq_length
    rule = "─" * 70  # the old code multiplied an empty string, printing nothing

    # Convert prompt to indices (unknown characters are skipped)
    prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx]
    generated = prompt_indices.copy()
    console.print(f"📝 Prompt: [cyan]\"{prompt}\"[/cyan]")
    console.print(f"🎯 Generating [cyan]{max_length}[/cyan] characters...\n")

    for _ in range(max_length):
        # Slicing already returns the whole list when it is shorter than the window.
        input_seq = generated[-window:]
        if len(input_seq) < window:
            input_seq = [0] * (window - len(input_seq)) + input_seq

        logits = model(Tensor(np.array([input_seq], dtype=np.int32)))

        # ``logits.data`` may be an ndarray or a wrapper holding one in ``.data``.
        # Check the type explicitly: every ndarray also has a ``.data`` attribute
        # (a raw buffer), so a hasattr() test cannot tell the two apart.
        raw = logits.data
        if not isinstance(raw, np.ndarray):
            raw = getattr(raw, 'data', raw)
        next_logits = np.asarray(raw)[0, -1, :]  # last position predictions

        if temperature <= 0:
            next_idx = int(np.argmax(next_logits))
        else:
            scaled = next_logits / temperature
            exp_logits = np.exp(scaled - np.max(scaled))  # subtract max for stability
            probs = exp_logits / np.sum(exp_logits)
            next_idx = int(np.random.choice(len(probs), p=probs))
        generated.append(next_idx)

    generated_text = dataset.decode(generated)
    console.print("[bold]📖 Generated Text:[/bold]")
    console.print(rule)
    console.print(f"[green]{generated_text}[/green]")
    console.print(rule)
    return generated_text
def analyze_transformer_systems(model):
    """Print a panel summarising the model's size, cost, memory and lineage.

    Reads ``model.total_params``, ``model.embed_dim`` and ``model.vocab_size``.
    The memory figure assumes 4 bytes per parameter and is shown in KB.
    """
    param_count = model.total_params
    param_kb = param_count * 4 / 1024

    architecture = (
        f"[bold]Model Architecture:[/bold]\n"
        f" • Parameters: [cyan]{param_count:,}[/cyan] weights\n"
        f" • Embedding dim: [cyan]{model.embed_dim}[/cyan]\n"
        f" • Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n"
    )
    complexity = (
        "[bold]Computational Complexity:[/bold]\n"
        " • Attention: O(n²·d) where n=sequence, d=dimension\n"
        " • Self-attention allows parallel processing (vs RNN sequential)\n"
        " • YOUR implementation: Pure Python + NumPy\n\n"
    )
    memory = (
        f"[bold]Memory Requirements:[/bold]\n"
        f" • Parameters: [cyan]{param_kb:.1f} KB[/cyan]\n"
        " • Attention matrices: O(n²) per layer\n"
        " • YOUR TinyTorch tracks gradients automatically\n\n"
    )
    history = (
        "[bold]🏛️ Transformer Evolution:[/bold]\n"
        " • 2017: Vaswani et al. 'Attention Is All You Need'\n"
        " • 2018: BERT (bidirectional), GPT (autoregressive)\n"
        " • 2020: GPT-3 (175B params, same architecture!)\n"
        " • 2022: ChatGPT (YOUR architecture at massive scale)\n"
        " • YOUR TinyGPT: Core principles that power them all!\n\n"
    )
    rationale = (
        "[bold]💡 Why Transformers Dominate:[/bold]\n"
        " • Parallelizable (vs sequential RNNs)\n"
        " • Long-range dependencies (attention sees everything)\n"
        " • Scalable (architecture works from 1M to 175B params)\n"
        " • YOUR implementation demonstrates all of these!"
    )

    console.print("")
    console.print(Panel.fit(
        architecture + complexity + memory + history + rationale,
        title="🔬 SYSTEMS ANALYSIS",
        border_style="cyan",
        box=box.DOUBLE,
    ))
def main():
    """Train TinyGPT on TinyStories and generate sample text with YOUR TinyTorch.

    Parses the command line, loads ``../datasets/tinystories/tinystories_val.txt``
    (relative to this file), builds the dataset, loader and model, then either
    runs a single forward pass (``--test-only``) or trains, samples text and
    prints a systems summary. Returns early if the dataset file is missing.
    """
    parser = argparse.ArgumentParser(description='TinyStories Transformer 2017')
    parser.add_argument('--test-only', action='store_true',
                        help='Test architecture only')
    parser.add_argument('--epochs', type=int, default=20,
                        help='Training epochs')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='Batch size')
    parser.add_argument('--seq-length', type=int, default=128,
                        help='Sequence length')
    parser.add_argument('--embed-dim', type=int, default=256,
                        help='Embedding dimension')
    parser.add_argument('--num-layers', type=int, default=6,
                        help='Number of transformer layers')
    parser.add_argument('--num-heads', type=int, default=8,
                        help='Number of attention heads')
    parser.add_argument('--visualize', action='store_true', default=True,
                        help='Show transformer visualization')
    # --visualize defaults to True, so on its own it could never be switched
    # off; this companion flag writes False to the same destination.
    parser.add_argument('--no-visualize', dest='visualize', action='store_false',
                        help='Skip transformer visualization')
    parser.add_argument('--quick-test', action='store_true',
                        help='Use small subset for testing')
    args = parser.parse_args()

    rule = "─" * 70  # the old code multiplied an empty string, printing nothing

    console.print("")
    console.print(Panel.fit(
        "[bold cyan]TinyStories Transformer - Simple Story Generation![/bold cyan]\n\n"
        "[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n"
        "[green]YOUR achievement:[/green] Generate coherent children's stories\n"
        "[cyan]Components used:[/cyan] YOUR complete NLP pipeline (Modules 2, 3, 4, 8, 10, 11, 12, 13)\n"
        "[dim]Note: TinyStories is much easier than Shakespeare - designed for small models![/dim]",
        title="🎯 Milestone 05: Transformer Era (2017)",
        border_style="cyan",
        box=box.DOUBLE
    ))

    # Visualization
    if args.visualize:
        visualize_transformer()

    # Step 1: Load TinyStories dataset
    console.print("\n[bold]📥 Loading TinyStories dataset...[/bold]")
    tinystories_path = os.path.join(
        os.path.dirname(__file__),
        '../datasets/tinystories/tinystories_val.txt'
    )
    if not os.path.exists(tinystories_path):
        console.print(f"[red]❌ TinyStories not found at {tinystories_path}[/red]")
        console.print("[yellow]Run: python milestones/05_2017_transformer/download_tinystories.py[/yellow]")
        return
    with open(tinystories_path, 'r', encoding='utf-8') as f:
        text = f.read()
    console.print(f"📊 Loaded: {len(text):,} characters, {len(text.split()):,} words")
    if args.quick_test:
        text = text[:100000]  # Use small subset for testing (100K chars)
        console.print(" [dim](Using 100K char subset for quick testing)[/dim]")

    # Step 2: Create Dataset and DataLoader using YOUR Module 08!
    console.print("\n[bold]📦 Creating YOUR Dataset and DataLoader (Module 08)...[/bold]")
    dataset = TinyStoriesDataset(text, seq_length=args.seq_length)
    train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters")
    console.print(f" Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]")
    console.print(f" DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]")

    # Step 3: Build Transformer
    model = TinyGPT(
        vocab_size=dataset.vocab_size,
        embed_dim=args.embed_dim,
        max_length=args.seq_length,
        num_heads=args.num_heads,
        num_layers=args.num_layers
    )
    console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]")
    console.print(f" Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings")
    console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters")
    console.print(f" Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)")

    if args.test_only:
        console.print("\n[bold yellow]🧪 ARCHITECTURE TEST MODE[/bold yellow]")
        # One random batch is enough to exercise the forward pass.
        test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32))
        test_output = model(test_input)
        console.print(f"[green]✅ Forward pass successful! Output shape: {test_output.data.shape}[/green]")
        console.print("[green]✅ YOUR Transformer + DataLoader work together![/green]")
        return

    # Step 4: Train using YOUR DataLoader
    start_time = time.time()
    model = train_tinystories_gpt(model, train_loader, dataset, epochs=args.epochs)
    train_time = time.time() - start_time

    # Step 5: Generate text! (generate_text prints its own output)
    generate_text(model, dataset, prompt="Once upon a time", max_length=200)

    # Additional generation examples
    console.print("\n[bold]🎭 More Generation Examples:[/bold]")
    console.print(rule)
    prompts = ["ROMEO:", "The king", "What is"]
    for prompt in prompts:
        # Only use prompts whose characters all exist in the vocabulary.
        if all(ch in dataset.char_to_idx for ch in prompt):
            console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]")
            generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8)

    # Step 6: Systems Analysis
    analyze_transformer_systems(model)
    console.print(f"\n[bold]⏱️ Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds")
    # train_tinystories_gpt stops after 500 batches per epoch, so count only
    # the sequences that were actually visited, and guard a zero duration.
    seen_per_epoch = min(len(dataset), 500 * args.batch_size)
    throughput = seen_per_epoch * args.epochs / train_time if train_time > 0 else 0.0
    console.print(f" Sequences/sec: [cyan]{throughput:.0f}[/cyan]")

    console.print("")
    console.print(Panel.fit(
        "[bold green]✅ SUCCESS! TinyStories Transformer Milestone Complete![/bold green]\n\n"
        "[bold]🎓 What YOU Accomplished:[/bold]\n"
        " • YOUR attention mechanism processes sequences in parallel\n"
        " • YOUR transformer captures long-range text dependencies\n"
        " • YOUR DataLoader efficiently batches character sequences\n"
        " • YOUR TinyGPT generates coherent text!\n"
        " • YOUR complete language modeling system works!\n\n"
        "[bold]🚀 Next Steps:[/bold]\n"
        " • Continue to Module 14 (KV-Caching) for 3x faster inference\n"
        " • YOUR transformer architecture scales to GPT-scale models\n"
        " • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!",
        title="🌟 2017 Transformer Revolution Complete",
        border_style="green",
        box=box.DOUBLE
    ))
# Run the TinyStories demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -1,375 +0,0 @@
"""
TinyTalks Chatbot - Train a Simple Conversational AI in 10-15 Minutes
======================================================================
A minimal but functional chatbot trained on simple Q&A pairs.
Goal: Show that transformers can learn conversational patterns quickly!
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import numpy as np
import time
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.optimizers import Adam
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.models.transformer import GPT
from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
enable_autograd()
# ============================================================================
# Tokenization
# ============================================================================
def create_tokenizer(conversations):
    """Build a character-level vocabulary with four reserved control tokens.

    Ids 0-3 are ``<PAD>``, ``<SOS>`` (start of sequence), ``<SEP>`` (separator
    between question and answer) and ``<EOS>`` (end of sequence). Every
    distinct character found in the question/answer pairs follows in sorted
    order, starting at id 4.

    Returns:
        ``(char_to_idx, idx_to_char)``: two dicts that are inverses of each other.
    """
    reserved = ('<PAD>', '<SOS>', '<SEP>', '<EOS>')
    char_to_idx = {token: position for position, token in enumerate(reserved)}
    idx_to_char = dict(enumerate(reserved))

    corpus = ' '.join(question + ' ' + answer for question, answer in conversations)
    for position, symbol in enumerate(sorted(set(corpus)), start=len(reserved)):
        char_to_idx[symbol] = position
        idx_to_char[position] = symbol
    return char_to_idx, idx_to_char
def encode_conversation(question, answer, char_to_idx, max_len=80):
    """
    Encode a Q&A pair as ``<SOS> question <SEP> answer <EOS> <PAD>...``.

    Characters missing from ``char_to_idx`` are encoded as id 0. The result
    always has exactly ``max_len`` ids: shorter sequences are right-padded
    with ``<PAD>`` and longer ones are cut off at ``max_len``.

    Example:
        Q: "Hi", A: "Hello"
        → [<SOS>, H, i, <SEP>, H, e, l, l, o, <EOS>, <PAD>, ...]
    """
    lookup = char_to_idx.get
    sequence = [char_to_idx['<SOS>']]
    sequence.extend(lookup(ch, 0) for ch in question)
    sequence.append(char_to_idx['<SEP>'])
    sequence.extend(lookup(ch, 0) for ch in answer)
    sequence.append(char_to_idx['<EOS>'])

    missing = max_len - len(sequence)
    if missing > 0:
        return sequence + [char_to_idx['<PAD>']] * missing
    return sequence[:max_len]
def decode_tokens(tokens, idx_to_char, stop_at_eos=True):
    """Turn a sequence of token ids back into text.

    ``<SOS>`` (1) is dropped and ``<SEP>`` (2) is rendered as ``' | '``.
    ``<PAD>`` (0) and ``<EOS>`` (3) end decoding when ``stop_at_eos`` is true
    and are silently skipped otherwise. Ids missing from ``idx_to_char``
    become ``'?'``.
    """
    PAD, SOS, SEP, EOS = 0, 1, 2, 3
    pieces = []
    for token in tokens:
        if token == PAD or token == EOS:
            if stop_at_eos:
                break
            continue
        if token == SOS:
            continue
        pieces.append(' | ' if token == SEP else idx_to_char.get(token, '?'))
    return ''.join(pieces)
# ============================================================================
# Training
# ============================================================================
def train_chatbot(model, optimizer, loss_fn, train_data, max_time_minutes=10):
    """
    Train the TinyTalks chatbot for a fixed wall-clock budget.

    Each step samples one encoded conversation at random, trains on
    next-token prediction (input ``tokens[:-1]``, target ``tokens[1:]``),
    clips every gradient element to [-1, 1] and applies one optimizer step.

    Args:
        model: Provides ``forward(x)`` returning logits whose ``.shape``
            unpacks into three values, and ``parameters()``.
        optimizer: Provides ``zero_grad()`` and ``step()``.
        loss_fn: Provides ``forward(logits, targets)``.
        train_data: Indexable collection of token-id lists.
        max_time_minutes: Training budget in minutes.

    Returns:
        ``(losses, step)``: the per-step loss values and the number of steps
        run. Both are empty/zero if the budget expired before the first step.
    """
    max_time_seconds = max_time_minutes * 60
    print("=" * 70)
    print(f"TRAINING TINYTALKS CHATBOT FOR {max_time_minutes} MINUTES")
    print("=" * 70)
    print(f"Dataset: {len(train_data)} conversations")
    print(f"Time limit: {max_time_seconds}s ({max_time_minutes} minutes)")
    print()

    start_time = time.time()
    losses = []
    step = 0

    # Progress checkpoints every 2 minutes
    checkpoint_interval = 120
    next_checkpoint = checkpoint_interval
    print("Training started...")
    print()

    while True:
        elapsed = time.time() - start_time
        if elapsed >= max_time_seconds:
            break

        # Sample random conversation
        tokens = train_data[np.random.randint(len(train_data))]

        # Next token prediction: shift the sequence by one position
        input_seq = tokens[:-1]
        target_seq = tokens[1:]
        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)

        # Forward
        logits = model.forward(x)

        # Loss over flattened (batch*seq, vocab) logits
        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = y_true.reshape(batch_size * seq_len)
        loss = loss_fn.forward(logits_flat, targets_flat)

        # Backward
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients element-wise, in place
        for param in model.parameters():
            if param.grad is not None:
                np.clip(param.grad, -1.0, 1.0, out=param.grad)

        # Update
        optimizer.step()
        losses.append(loss.data.item())
        step += 1

        # Show progress at checkpoints (elapsed >= 120 here, so never zero)
        if elapsed >= next_checkpoint:
            avg_loss = _recent_loss(losses)
            steps_per_sec = step / elapsed
            mins = int(elapsed / 60)
            print(f"[{mins:2d} min] Step {step:5d} | Loss: {avg_loss:.4f} | Speed: {steps_per_sec:.1f} steps/sec")
            next_checkpoint += checkpoint_interval

        # Also show every 500 steps for early progress
        if step % 500 == 0 and step <= 2000:
            avg_loss = _recent_loss(losses)
            print(f"[{int(elapsed):4d}s] Step {step:5d} | Loss: {avg_loss:.4f}")

    final_elapsed = time.time() - start_time
    print()
    print("=" * 70)
    print("TRAINING COMPLETE")
    print("=" * 70)
    print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.1f} minutes)")
    print(f"Total steps: {step:,}")

    if not losses:
        # No step ran, so there is nothing to average; np.mean([]) would
        # produce NaN and a RuntimeWarning.
        print("No training steps completed within the time limit.")
        print()
        return losses, step

    final_loss = _recent_loss(losses)
    initial_loss = np.mean(losses[:10])
    improvement = (1 - final_loss / initial_loss) * 100
    print(f"Steps/second: {step/final_elapsed:.1f}")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()
    return losses, step


def _recent_loss(losses, window=100):
    """Mean of the last ``window`` losses (all of them if fewer exist)."""
    # Slicing already returns the whole list when it is shorter than the window.
    return np.mean(losses[-window:])
# ============================================================================
# Generation / Chat
# ============================================================================
def generate_response(model, question, char_to_idx, idx_to_char, max_len=50, context_len=80):
    """
    Generate a greedy (arg-max) response to a question.

    Process:
        1. Encode: <SOS> question <SEP>
        2. Repeatedly pad the sequence to ``context_len``, run the model and
           append the highest-scoring next token, until <EOS>/<PAD> is
           predicted, ``max_len`` tokens were produced, or the context is full.
        3. Decode the generated tokens.

    Args:
        model: Provides ``forward(x)`` returning logits read as
            ``logits.data[0, position, :]``.
        question: Text to answer; characters missing from ``char_to_idx`` are
            encoded as id 0.
        char_to_idx: Mapping with ``<SOS>``, ``<SEP>``, ``<EOS>``, ``<PAD>`` keys.
        idx_to_char: Inverse mapping passed to ``decode_tokens``.
        max_len: Maximum number of tokens to generate.
        context_len: Fixed input length fed to the model. Defaults to 80, the
            value that was previously hard-coded to match training.

    Returns:
        The decoded response string (empty if nothing was generated).
    """
    pad_id = char_to_idx['<PAD>']
    stop_ids = (char_to_idx['<EOS>'], pad_id)

    # Encode question
    tokens = [char_to_idx['<SOS>']]
    tokens.extend(char_to_idx.get(c, 0) for c in question)
    tokens.append(char_to_idx['<SEP>'])

    generated_tokens = []
    for _ in range(max_len):
        # The logits at the last filled position predict the next token.
        next_pos = len(tokens) + len(generated_tokens) - 1
        if next_pos >= context_len:
            break  # context is full; skip the pointless forward pass

        sequence = tokens + generated_tokens
        input_tokens = sequence + [pad_id] * (context_len - len(sequence))

        # Forward pass
        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)
        if next_pos >= logits.shape[1]:
            break

        next_token = int(np.argmax(logits.data[0, next_pos, :]))
        # Stop at EOS or PAD
        if next_token in stop_ids:
            break
        generated_tokens.append(next_token)

    # Decode generated response
    return decode_tokens(generated_tokens, idx_to_char, stop_at_eos=False)
def test_chatbot(model, test_questions, char_to_idx, idx_to_char):
    """Print the model's generated answer for each sample question."""
    banner = "=" * 70
    print(banner, "TESTING CHATBOT", banner, "", sep="\n")
    for prompt in test_questions:
        reply = generate_response(model, prompt, char_to_idx, idx_to_char)
        # One Q/A pair followed by a blank line.
        print(f"Q: {prompt}\nA: {reply}\n")
# ============================================================================
# Main
# ============================================================================
def main():
    """Train the TinyTalks chatbot end to end and print sample responses.

    Loads the dataset, builds a character tokenizer, encodes every
    conversation to 80 ids, trains a very small GPT for 15 minutes, then
    prints the model's answers to a fixed set of questions and a summary.
    """
    print()
    print("=" * 70)
    print("TINYTALKS CHATBOT - 10-15 MINUTE TRAINING")
    print("=" * 70)
    print()

    # Load dataset
    conversations = create_tinytalks_dataset()
    stats = get_dataset_stats()
    print(f"Dataset: {stats['total_examples']} examples ({stats['unique_examples']} unique)")
    print(f"Repetition: {stats['repetition_factor']:.1f}x for better learning")
    print(f"Avg lengths: Q={stats['avg_question_len']:.1f} chars, A={stats['avg_answer_len']:.1f} chars")
    print()

    # Create tokenizer
    char_to_idx, idx_to_char = create_tokenizer(conversations)
    vocab_size = len(idx_to_char)
    print(f"Vocabulary: {vocab_size} tokens (including special tokens)")
    print()

    # Encode dataset
    max_seq_len = 80
    train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations]

    # Model: Ultra-tiny for speed (learned from 5-min test!)
    # Target: ~20-30 steps/sec with longer sequences
    # In 10 mins (600s): ~12,000-18,000 steps
    config = {
        'vocab_size': vocab_size,
        'embed_dim': 16,   # Keep it tiny!
        'num_layers': 1,   # Just 1 layer
        'num_heads': 2,    # 2 heads
        'max_seq_len': max_seq_len,
    }
    print("Model configuration:")
    for key, val in config.items():
        print(f" {key}: {val}")
    print()

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    print()

    # Optimizer
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Train for 15 minutes (adjustable)
    train_time = 15  # minutes
    print(f"Training for {train_time} minutes...")
    print()
    losses, total_steps = train_chatbot(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        max_time_minutes=train_time
    )

    # Test with sample questions
    test_questions = [
        "Hi",
        "How are you",
        "What is your name",
        "What is the sky",
        "Is grass green",
        "What is 1 plus 1",
        "Are you happy",
        "Bye",
    ]
    print("Testing chatbot responses...")
    print()
    test_chatbot(model, test_questions, char_to_idx, idx_to_char)

    # Summary
    print("=" * 70)
    print("TINYTALKS SUMMARY")
    print("=" * 70)
    print(f"✓ Model: {num_params:,} parameters")
    print(f"✓ Training: {train_time} minutes, {total_steps:,} steps")
    if losses:
        first_loss = np.mean(losses[:10])
        last_loss = np.mean(losses[-100:])
        # The two values used to be printed back to back with no separator.
        print(f"✓ Loss: {first_loss:.4f} → {last_loss:.4f}")
        print(f"✓ Improvement: {(1 - last_loss / first_loss) * 100:.1f}%")
    else:
        # Averaging an empty list would print NaN.
        print("✓ Loss: n/a (no training steps completed)")
    print()
    print("Try it yourself:")
    print(" 1. Ask simple questions from the training set")
    print(" 2. The model should generate learned responses")
    print(" 3. Experiment with model size and training time!")
    print()
# Run the TinyTalks chatbot demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -1,546 +0,0 @@
"""
TinyTalks Interactive Dashboard - Watch Learning Happen Live!
=============================================================
A beautiful, educational dashboard showing a transformer learn to chat.
Students see:
- Live training metrics
- Responses improving from gibberish to coherent
- Real-time checkpoints with before/after comparison
- Visual feedback on what's correct vs incorrect
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import numpy as np
import time
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.optimizers import Adam
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.models.transformer import GPT
from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
enable_autograd()
# Rich CLI imports
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.layout import Layout
from rich.live import Live
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
from rich import box
from rich.text import Text
console = Console()
# ============================================================================
# Tokenization (same as tinytalks_chatbot.py)
# ============================================================================
def create_tokenizer(conversations):
    """Create a character-level tokenizer with special tokens.

    The four special tokens take ids 0-3 (``<PAD>``, ``<SOS>``, ``<SEP>``,
    ``<EOS>``); the distinct characters of all question/answer text follow in
    sorted order. Returns ``(char_to_idx, idx_to_char)``.
    """
    specials = ['<PAD>', '<SOS>', '<SEP>', '<EOS>']
    joined = ' '.join([q + ' ' + a for q, a in conversations])
    # Special tokens are multi-character strings, so they can never collide
    # with the single characters that come after them.
    alphabet = specials + sorted(set(joined))

    char_to_idx = {}
    idx_to_char = {}
    for position, symbol in enumerate(alphabet):
        char_to_idx[symbol] = position
        idx_to_char[position] = symbol
    return char_to_idx, idx_to_char
def encode_conversation(question, answer, char_to_idx, max_len=80):
    """Encode a Q&A pair as: <SOS> question <SEP> answer <EOS> <PAD>...

    Unknown characters map to id 0. The returned list always has ``max_len``
    entries: it is right-padded with ``<PAD>`` or truncated as needed.
    """
    def ids(text):
        # Per-character lookup with 0 as the fallback id.
        return [char_to_idx.get(ch, 0) for ch in text]

    tokens = [
        char_to_idx['<SOS>'],
        *ids(question),
        char_to_idx['<SEP>'],
        *ids(answer),
        char_to_idx['<EOS>'],
    ]
    if len(tokens) >= max_len:
        return tokens[:max_len]
    padding = [char_to_idx['<PAD>']] * (max_len - len(tokens))
    return tokens + padding
def decode_tokens(tokens, idx_to_char):
    """Decode token ids to a string.

    Decoding stops at the first ``<EOS>`` (3). ``<PAD>`` (0), ``<SOS>`` (1)
    and ``<SEP>`` (2) are skipped; ids missing from ``idx_to_char`` become '?'.
    """
    skipped = (0, 1, 2)  # PAD, SOS, SEP
    pieces = []
    for token in tokens:
        if token == 3:  # EOS
            break
        if token in skipped:
            continue
        pieces.append(idx_to_char.get(token, '?'))
    return ''.join(pieces)
def generate_response(model, question, char_to_idx, idx_to_char, max_len=50, context_len=80):
    """Generate a greedy (arg-max) response to a question.

    The prompt ``<SOS> question <SEP>`` is padded to ``context_len`` ids and
    fed through the model; the highest-scoring token at the last filled
    position is appended, and the process repeats until <EOS>/<PAD> is
    predicted, ``max_len`` tokens were produced, or the context is full.

    Args:
        model: Provides ``forward(x)`` returning logits read as
            ``logits.data[0, position, :]``.
        question: Text to answer; unknown characters are encoded as id 0.
        char_to_idx: Mapping with ``<SOS>``, ``<SEP>``, ``<EOS>``, ``<PAD>`` keys.
        idx_to_char: Inverse mapping passed to ``decode_tokens``.
        max_len: Maximum number of tokens to generate.
        context_len: Fixed input length fed to the model. Defaults to 80, the
            value that was previously hard-coded.

    Returns:
        The decoded response string (empty if nothing was generated).
    """
    pad_id = char_to_idx['<PAD>']
    stop_ids = (char_to_idx['<EOS>'], pad_id)

    tokens = [char_to_idx['<SOS>']]
    tokens.extend(char_to_idx.get(c, 0) for c in question)
    tokens.append(char_to_idx['<SEP>'])

    generated_tokens = []
    for _ in range(max_len):
        # The logits at the last filled position predict the next token.
        next_pos = len(tokens) + len(generated_tokens) - 1
        if next_pos >= context_len:
            break  # context is full; skip the pointless forward pass

        sequence = tokens + generated_tokens
        input_tokens = sequence + [pad_id] * (context_len - len(sequence))

        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)
        if next_pos >= logits.shape[1]:
            break

        next_token = int(np.argmax(logits.data[0, next_pos, :]))
        if next_token in stop_ids:
            break
        generated_tokens.append(next_token)

    return decode_tokens(generated_tokens, idx_to_char)
# ============================================================================
# Dashboard Components
# ============================================================================
def create_welcome_panel():
    """Build the introductory panel shown before training starts."""
    headline = "[bold cyan]🤖 TINYTALKS - Watch a Transformer Learn to Chat![/bold cyan]\n\n"
    intro = (
        "[dim]You're about to see AI learning happen in real-time.\n"
        "The model starts knowing nothing - just random noise.\n"
        "Every training step makes it slightly smarter.\n"
        "Watch responses improve from gibberish to coherent conversation![/dim]\n\n"
    )
    facts = (
        "[bold]Training Duration:[/bold] 10-15 minutes\n"
        "[bold]Checkpoints:[/bold] Every ~2 minutes\n"
        "[bold]What to watch:[/bold] Loss ↓ = Better responses ✓"
    )
    welcome = Panel.fit(
        headline + intro + facts,
        title="🎓 Educational AI Training Demo",
        border_style="cyan",
        box=box.DOUBLE,
    )
    return welcome
def create_metrics_table(step, loss, elapsed, steps_per_sec):
    """Build a two-column table of the current step, loss, time and speed.

    ``elapsed`` is in seconds and is rendered as ``<minutes>m <seconds>s``.
    """
    whole_minutes = int(elapsed/60)
    leftover_seconds = int(elapsed%60)
    rows = (
        ("Step", f"{step:,}"),
        ("Loss", f"{loss:.4f}"),
        ("Time", f"{whole_minutes}m {leftover_seconds}s"),
        ("Speed", f"{steps_per_sec:.1f} steps/sec"),
    )

    metrics = Table(show_header=False, box=box.SIMPLE, padding=(0, 2))
    metrics.add_column("Metric", style="cyan")
    metrics.add_column("Value", style="green bold")
    for label, value in rows:
        metrics.add_row(label, value)
    return metrics
def create_checkpoint_comparison(checkpoint_num, step, loss, test_results, expected_answers):
    """Create a checkpoint table comparing model responses to expected answers.

    Args:
        checkpoint_num: Checkpoint index shown in the title.
        step: Training step shown in the title.
        loss: Loss value shown in the title.
        test_results: Sequence of ``(question, actual_response)`` pairs.
        expected_answers: Expected answer for each entry of ``test_results``,
            in the same order.

    Returns:
        A table with one row per question: the question, the (possibly
        truncated) response, and a status of Perfect / Close / Wrong / Empty.
        Accuracy in the title is the share of Perfect rows, or 0 when
        ``test_results`` is empty.
    """
    graded = [
        (question, actual) + _grade_response(actual, expected)
        for (question, actual), expected in zip(test_results, expected_answers)
    ]
    correct = sum(1 for _, _, _, style in graded if style == "green")
    total = len(test_results)
    # Guard the empty case, which used to raise ZeroDivisionError.
    accuracy = (correct / total) * 100 if total else 0.0

    table = Table(
        title=f"Checkpoint {checkpoint_num} - Step {step:,} | Loss: {loss:.4f} | Accuracy: {accuracy:.0f}%",
        box=box.ROUNDED,
        show_header=True
    )
    table.add_column("Question", style="cyan", width=22)
    table.add_column("Model Response", style="white", width=28)
    table.add_column("Status", justify="center", width=8)

    for question, actual, status, response_style in graded:
        # Truncate long responses
        display_response = actual[:26] + "..." if len(actual) > 26 else actual
        table.add_row(
            question,
            f"[{response_style}]{display_response}[/{response_style}]",
            status
        )
    return table


def _grade_response(actual, expected):
    """Classify one response; returns a ``(status_markup, style)`` pair."""
    got = actual.strip().lower()
    want = expected.strip().lower()
    if got == want:
        return "[green]✓ Perfect[/green]", "green"
    if not got:
        # Must be tested before the substring check: '' is contained in every
        # string, so an empty reply used to be graded "Close" and the "Empty"
        # status could never be reached.
        return "[dim]- Empty[/dim]", "dim"
    if want and (want in got or got in want):
        return "[yellow]≈ Close[/yellow]", "yellow"
    return "[red]✗ Wrong[/red]", "red"
def create_progress_panel(step, total_steps, checkpoint_num, total_checkpoints):
    """Create a panel with ASCII progress bars for steps and checkpoints.

    Args:
        step: Steps completed so far.
        total_steps: Expected total number of steps (0 or less shows 0%).
        checkpoint_num: Checkpoints completed so far.
        total_checkpoints: Expected total number of checkpoints.

    Returns:
        A panel containing both bars. Each bar is always 40 characters wide
        between its brackets; the printed percentage is not clamped, so it
        may exceed 100 when the totals were underestimated.
    """
    bar_width = 40

    def percent(done, total):
        return (done / total) * 100 if total > 0 else 0

    def bar(pct):
        # One cell per 2.5%. Clamp the fill so a value above 100% (or below
        # 0%) cannot make the bar overflow or shrink.
        filled = min(bar_width, max(0, int(pct / (100 / bar_width))))
        return "[" + "=" * filled + " " * (bar_width - filled) + "]"

    step_progress = percent(step, total_steps)
    checkpoint_progress = percent(checkpoint_num, total_checkpoints)
    step_bar = bar(step_progress)
    checkpoint_bar = bar(checkpoint_progress)

    text = (
        f"[bold]Training Progress:[/bold]\n"
        f"{step_bar} {step_progress:.1f}% ({step}/{total_steps} steps)\n\n"
        f"[bold]Checkpoints:[/bold]\n"
        f"{checkpoint_bar} {checkpoint_progress:.1f}% ({checkpoint_num}/{total_checkpoints} completed)"
    )
    return Panel(text, title="📊 Progress", border_style="blue")
# ============================================================================
# Training with Dashboard
# ============================================================================
def train_with_dashboard(model, optimizer, loss_fn, train_data, test_questions, expected_answers,
                         char_to_idx, idx_to_char, max_time_minutes=10, checkpoint_interval_steps=1500):
    """
    Train with beautiful dashboard showing live progress.

    Trains on one randomly chosen sequence per step until the wall-clock
    budget is used up. A status line is printed every 5 seconds and a full
    evaluation table every ``checkpoint_interval_steps`` steps.

    Args:
        model: Object exposing ``forward(x)`` and ``parameters()``.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        loss_fn: Object exposing ``forward(logits, targets)``.
        train_data: Indexable collection of token-id sequences. Each sequence
            is split into inputs (all but the last token) and targets (all
            but the first token).
        test_questions: Questions answered at every checkpoint.
        expected_answers: Reference answers aligned with ``test_questions``.
        char_to_idx: Forwarded unchanged to ``generate_response``.
        idx_to_char: Forwarded unchanged to ``generate_response``.
        max_time_minutes: Wall-clock budget, measured from just before the
            initial evaluation (so evaluation pauses count against it).
        checkpoint_interval_steps: Steps between checkpoint evaluations.

    Returns:
        Tuple ``(losses, step, accuracy)``: the per-step loss values, the
        number of steps taken, and the percentage of test questions whose
        final answer matches the expected answer exactly (case-insensitive,
        surrounding whitespace ignored).
    """
    def smoothed_loss():
        # Mean over the most recent (up to 100) steps; NaN before any step ran.
        return np.mean(losses[-100:]) if losses else float("nan")

    def evaluate():
        # One (question, generated answer) pair per test question.
        return [(q, generate_response(model, q, char_to_idx, idx_to_char)) for q in test_questions]

    max_time_seconds = max_time_minutes * 60

    console.clear()
    console.print(create_welcome_panel())
    console.print()
    # console.input renders the markup; builtin input() would print the
    # "[bold cyan]" tags literally.
    console.input("[bold cyan]Press ENTER to start training...[/bold cyan]")
    console.clear()

    # Training setup
    start_time = time.time()
    losses = []
    step = 0
    checkpoint_num = 0

    # Calculate expected checkpoints
    estimated_total_steps = int(max_time_seconds * 12)  # ~12 steps/sec
    total_checkpoints = estimated_total_steps // checkpoint_interval_steps

    # Initial evaluation (999.9 is a placeholder loss for the untrained model)
    console.print("\n[bold]📊 CHECKPOINT 0: Initial Model (Untrained)[/bold]\n")
    initial_results = evaluate()
    console.print(create_checkpoint_comparison(0, 0, 999.9, initial_results, expected_answers))
    console.print()
    console.print("[dim]Starting training... Watch the responses improve![/dim]\n")
    time.sleep(2)

    next_checkpoint = checkpoint_interval_steps
    last_print_time = time.time()

    # Training loop
    while True:
        elapsed = time.time() - start_time
        if elapsed >= max_time_seconds:
            break

        # Training step: next-token prediction on one random sequence
        tokens = train_data[np.random.randint(len(train_data))]
        input_seq = tokens[:-1]
        target_seq = tokens[1:]
        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)

        logits = model.forward(x)
        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = y_true.reshape(batch_size * seq_len)
        loss = loss_fn.forward(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()
        # Element-wise gradient clipping to [-1, 1], written back in place.
        # NOTE(review): assumes param.grad is a NumPy array — confirm.
        for param in model.parameters():
            if param.grad is not None:
                np.clip(param.grad, -1.0, 1.0, out=param.grad)
        optimizer.step()

        losses.append(loss.data.item())
        step += 1

        # Print progress every 5 seconds
        if time.time() - last_print_time >= 5.0:
            avg_loss = smoothed_loss()
            steps_per_sec = step / elapsed
            console.print(
                f"[dim]Step {step:5d} | "
                f"Loss: {avg_loss:.4f} | "
                f"Time: {int(elapsed/60)}m{int(elapsed%60):02d}s | "
                f"Speed: {steps_per_sec:.1f} steps/sec[/dim]"
            )
            last_print_time = time.time()

        # Checkpoint evaluation
        if step >= next_checkpoint:
            checkpoint_num += 1
            avg_loss = smoothed_loss()
            console.print("\n" + "="*70)
            console.print(f"[bold yellow]⏸️ CHECKPOINT {checkpoint_num}[/bold yellow]")
            console.print(f"[dim]Pausing training to evaluate... (Step {step:,})[/dim]\n")

            # Evaluate
            current_results = evaluate()

            # Show results
            console.print(create_checkpoint_comparison(checkpoint_num, step, avg_loss, current_results, expected_answers))
            console.print()

            # Show progress
            console.print(create_progress_panel(step, estimated_total_steps, checkpoint_num, total_checkpoints))
            console.print()
            console.print("[dim]Continuing training...[/dim]\n")

            next_checkpoint += checkpoint_interval_steps
            time.sleep(1)

    # Final results
    final_elapsed = time.time() - start_time
    final_loss = smoothed_loss()
    # With a zero time budget no step runs; report NaN rather than taking the
    # mean of an empty list (which emits a RuntimeWarning).
    initial_loss = np.mean(losses[:10]) if losses else float("nan")
    improvement = (1 - final_loss / initial_loss) * 100

    console.print("\n" + "="*70)
    console.print("[bold green]🎉 TRAINING COMPLETE![/bold green]\n")

    # Final evaluation
    final_results = evaluate()
    console.print(create_checkpoint_comparison("FINAL", step, final_loss, final_results, expected_answers))
    console.print()

    # Summary table
    steps_per_second = step / final_elapsed if final_elapsed > 0 else 0.0
    summary = Table(title="Training Summary", box=box.DOUBLE, show_header=True)
    summary.add_column("Metric", style="cyan", width=30)
    summary.add_column("Value", style="green bold", width=30)
    summary.add_row("Total Training Time", f"{final_elapsed/60:.1f} minutes")
    summary.add_row("Total Steps", f"{step:,}")
    summary.add_row("Steps/Second", f"{steps_per_second:.1f}")
    summary.add_row("Initial Loss", f"{initial_loss:.4f}")
    summary.add_row("Final Loss", f"{final_loss:.4f}")
    summary.add_row("Improvement", f"{improvement:.1f}%")
    summary.add_row("Checkpoints Evaluated", f"{checkpoint_num}")
    console.print(summary)
    console.print()

    # Count perfect responses for milestone card
    correct = sum(1 for (q, actual), expected in zip(final_results, expected_answers)
                  if actual.strip().lower() == expected.strip().lower())
    # No test questions means nothing was answered correctly: 0%, not a crash.
    accuracy = (correct / len(test_questions)) * 100 if test_questions else 0.0

    return losses, step, accuracy
# ============================================================================
# Main
# ============================================================================
def main():
    """Build the TinyTalks data, a tiny GPT and its optimizer, train, and print a milestone card.

    The success card is shown when final accuracy is at least 50% AND the
    loss improved by at least 80%; otherwise a "needs more time" card is shown.
    """
    # Dataset
    conversations = create_tinytalks_dataset()
    char_to_idx, idx_to_char = create_tokenizer(conversations)
    vocab_size = len(idx_to_char)

    # Encode
    max_seq_len = 80
    train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations]

    # Test questions and expected answers (aligned by position)
    test_questions = [
        "Hi",
        "How are you",
        "What is your name",
        "What is the sky",
        "Is grass green",
        "What is 1 plus 1",
        "Are you happy"
    ]
    expected_answers = [
        "Hello! How can I help you?",
        "I am doing well, thanks!",
        "I am TinyBot",
        "The sky is blue",
        "Yes, grass is green",
        "1 plus 1 equals 2",
        "Yes, I am happy"
    ]

    # Model
    config = {
        'vocab_size': vocab_size,
        'embed_dim': 16,
        'num_layers': 1,
        'num_heads': 2,
        'max_seq_len': max_seq_len,
    }
    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())

    # Optimizer
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Train with dashboard
    train_time = 15  # 15 minutes for better results
    checkpoint_interval = 2000  # Every ~2.5 minutes
    console.print(Panel.fit(
        f"[bold]Model:[/bold] {num_params:,} parameters (ultra-tiny!)\n"
        f"[bold]Training Time:[/bold] {train_time} minutes\n"
        f"[bold]Checkpoints:[/bold] Every {checkpoint_interval} steps (~2 min)\n"
        f"[bold]Test Questions:[/bold] {len(test_questions)} questions\n\n"
        f"[dim]Watch loss decrease and responses improve![/dim]",
        title="⚙️ Configuration",
        border_style="blue"
    ))

    losses, total_steps, final_accuracy = train_with_dashboard(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        test_questions=test_questions,
        expected_answers=expected_answers,
        char_to_idx=char_to_idx,
        idx_to_char=idx_to_char,
        max_time_minutes=train_time,
        checkpoint_interval_steps=checkpoint_interval
    )

    # Calculate metrics for milestone card:
    # mean of the last 100 losses relative to the mean of the first 10.
    loss_improvement = (1 - np.mean(losses[-100:]) / np.mean(losses[:10])) * 100

    # Milestone completion card
    console.print()
    if final_accuracy >= 50 and loss_improvement >= 80:
        console.print(Panel.fit(
            "[bold green]🎉 Congratulations! You've Built a Working Chatbot![/bold green]\n\n"
            f"Final accuracy: [bold]{final_accuracy:.0f}%[/bold] | "
            f"Loss improved: [bold]{loss_improvement:.1f}%[/bold]\n\n"
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
            "[bold]💡 What YOU Just Accomplished:[/bold]\n"
            " ✓ Built a TRANSFORMER (2017 Vaswani et al)\n"
            " ✓ Trained with attention mechanism from scratch\n"
            " ✓ Watched AI learn language patterns in real-time\n"
            " ✓ Demonstrated gradient descent on complex architectures\n"
            f" ✓ Trained {total_steps:,} steps in {train_time} minutes!\n\n"
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
            "[bold]🎓 Why This Matters:[/bold]\n"
            " This is the SAME architecture behind ChatGPT, GPT-4, and BERT.\n"
            " You just witnessed the magic of:\n"
            " • Self-attention (learning relationships between words)\n"
            " • Position encoding (understanding word order)\n"
            " • Autoregressive generation (predicting next token)\n\n"
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
            "[bold]📌 The Key Insight:[/bold]\n"
            " You saw responses evolve from gibberish to coherent:\n"
            " Checkpoint 0: Random noise\n"
            " Checkpoint 1: Recognizable words\n"
            " Checkpoint 2: Partial sentences\n"
            " Final: Perfect responses!\n"
            " \n"
            " [yellow]Scale it up:[/yellow] Same process, more data, more params →\n"
            # 175B is GPT-3's published parameter count, not GPT-4's.
            " You get GPT-3 (175B params, trained for weeks)!\n\n"
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
            "[bold]🚀 What You Can Do Now:[/bold]\n"
            "• Experiment with different architectures (layers, heads)\n"
            "• Try longer training (15-20 minutes for better results)\n"
            "• Add more conversation patterns to the dataset\n"
            "• Scale up the model (more parameters = better learning)\n\n"
            "[bold cyan]You've mastered the foundation of modern AI! 🌟[/bold cyan]",
            title="🌟 2017 Transformer Complete - Milestone 05",
            border_style="green",
            box=box.DOUBLE
        ))
    else:
        console.print(Panel.fit(
            "[bold yellow]⚠️ Training Complete - Needs More Time[/bold yellow]\n\n"
            f"Current accuracy: {final_accuracy:.0f}% | Loss improved: {loss_improvement:.1f}%\n\n"
            "Your transformer is learning but needs more training time.\n\n"
            "[bold]What to try:[/bold]\n"
            # Report the budget actually used instead of a hard-coded "10".
            f"• Train for longer than the current {train_time} minutes\n"
            "• Use a slightly bigger model (2 layers, 24 dims)\n"
            "• Add more data repetition for reinforcement\n\n"
            "[dim]The attention mechanism is working - it just needs more steps to converge!\n"
            "Even partial success shows the transformer learned patterns.[/dim]",
            title="🔄 Learning in Progress",
            border_style="yellow",
            box=box.DOUBLE
        ))


if __name__ == "__main__":
    main()

View File

@@ -1,208 +0,0 @@
"""
TinyTalks Dataset - Small Conversational Dataset for Transformer Training
==========================================================================
A carefully curated micro-dataset for training a chatbot in 10-15 minutes.
Design Principles:
- Simple Q&A format
- Short responses (< 50 characters)
- Common patterns repeated for learning
- Character-level tokenization friendly
- Covers basic conversational topics
"""
def create_tinytalks_dataset():
    """
    Build the TinyTalks list of (question, answer) pairs.

    Each category contributes its base pairs once, followed by a replay of
    selected base pairs so that the most important patterns occur more often
    (up to three times in total). A new list is returned on every call.

    Categories, in output order:
    1. Greetings
    2. Simple facts
    3. Yes/no questions
    4. Weather
    5. Feelings
    6. Simple math
    7. Colors
    8. Identity
    9. Capabilities
    """
    greetings = [
        ("Hi", "Hello! How can I help you?"),
        ("Hello", "Hi there! How are you?"),
        ("Good morning", "Good morning! Nice day!"),
        ("How are you", "I am doing well, thanks!"),
        ("Bye", "Goodbye! Have a great day!"),
    ]
    simple_facts = [
        ("What is the sky", "The sky is blue"),
        ("What is grass", "Grass is green"),
        ("What is the sun", "The sun is bright and yellow"),
        ("What is water", "Water is wet and clear"),
        ("What is fire", "Fire is hot and red"),
    ]
    yes_no = [
        ("Is the sky blue", "Yes, the sky is blue"),
        ("Is grass green", "Yes, grass is green"),
        ("Is the sun cold", "No, the sun is hot"),
        ("Is water dry", "No, water is wet"),
        ("Is fire safe", "No, fire is dangerous"),
    ]
    weather = [
        ("Is it sunny", "Yes, it is sunny today"),
        ("Is it raining", "No, it is not raining"),
        ("Is it cold", "No, it is warm today"),
        ("What is the weather", "The weather is nice"),
    ]
    feelings = [
        ("Are you happy", "Yes, I am happy"),
        ("Are you sad", "No, I am not sad"),
        ("Are you tired", "No, I feel good"),
        ("Do you like learning", "Yes, I love learning"),
    ]
    simple_math = [
        ("What is 1 plus 1", "1 plus 1 equals 2"),
        ("What is 2 plus 2", "2 plus 2 equals 4"),
        ("What is 3 plus 3", "3 plus 3 equals 6"),
        ("What is 5 plus 5", "5 plus 5 equals 10"),
    ]
    colors = [
        ("What color is the sky", "The sky is blue"),
        ("What color is grass", "Grass is green"),
        ("What color is the sun", "The sun is yellow"),
        ("What color is snow", "Snow is white"),
    ]
    identity = [
        ("What is your name", "I am TinyBot"),
        ("Who are you", "I am TinyBot, your helper"),
        ("What do you do", "I help answer questions"),
    ]
    capabilities = [
        ("Can you help me", "Yes, I can help you"),
        ("Can you talk", "Yes, I can talk with you"),
        ("Do you understand", "Yes, I understand you"),
    ]

    # (base pairs, indices into the base list replayed after it, in order).
    # A pair's total count is 1 + the number of times its index is replayed.
    schedule = (
        (greetings, (0, 1, 3, 4, 0, 1, 4)),
        (simple_facts, (0, 1, 3, 0, 1)),
        (yes_no, (0, 1, 2, 0, 1)),
        (weather, (0, 3, 0)),
        (feelings, (0, 3, 0)),
        (simple_math, (0, 1, 0)),
        (colors, (0, 1, 0)),
        (identity, (0, 1, 0)),
        (capabilities, (0, 1)),
    )

    conversations = []
    for base_pairs, replay_indices in schedule:
        conversations.extend(base_pairs)
        conversations.extend(base_pairs[i] for i in replay_indices)
    return conversations
def get_dataset_stats():
    """Compute summary statistics for the TinyTalks dataset.

    Returns:
        Dict with the total and unique example counts, their ratio
        (``repetition_factor``), the combined character count of all
        questions and answers, the mean question and answer lengths, and a
        fixed list of category labels (hard-coded text, not derived from
        the data).
    """
    pairs = create_tinytalks_dataset()
    example_count = len(pairs)
    distinct_count = len(set(pairs))

    # Character totals are taken per side so the averages can reuse them.
    question_chars = sum(len(question) for question, _ in pairs)
    answer_chars = sum(len(answer) for _, answer in pairs)

    category_labels = [
        'Greetings (5x repeat)',
        'Simple Facts (3x repeat)',
        'Yes/No Questions (3x repeat)',
        'Weather (3x repeat)',
        'Feelings (3x repeat)',
        'Simple Math (3x repeat)',
        'Colors (3x repeat)',
        'Identity (3x repeat)',
        'Capabilities (2x repeat)'
    ]

    stats = {}
    stats['total_examples'] = example_count
    stats['unique_examples'] = distinct_count
    stats['repetition_factor'] = example_count / distinct_count
    stats['total_chars'] = question_chars + answer_chars
    stats['avg_question_len'] = question_chars / example_count
    stats['avg_answer_len'] = answer_chars / example_count
    stats['categories'] = category_labels
    return stats
def print_dataset_info():
    """Print dataset information.

    Prints the headline statistics, the category labels, and up to ten
    sample conversations. The sample is the same on every run.
    """
    import random

    conversations = create_tinytalks_dataset()
    stats = get_dataset_stats()

    print("=" * 70)
    print("TINYTALKS DATASET")
    print("=" * 70)
    print()
    print(f"Total examples: {stats['total_examples']}")
    print(f"Unique examples: {stats['unique_examples']}")
    print(f"Repetition factor: {stats['repetition_factor']:.1f}x")
    print(f"Average question length: {stats['avg_question_len']:.1f} chars")
    print(f"Average answer length: {stats['avg_answer_len']:.1f} chars")
    print()
    print("Categories:")
    for cat in stats['categories']:
        print(f"{cat}")
    print()
    print("Sample conversations:")
    print("-" * 70)

    # Show 10 random unique examples.
    # dict.fromkeys de-duplicates while keeping first-seen order. Going through
    # set() would order the pairs by randomized string hashes, so a fixed seed
    # would still pick different samples from run to run.
    unique = list(dict.fromkeys(conversations))
    # A private generator keeps the fixed seed from leaking into the global
    # `random` state of whoever imported this module.
    rng = random.Random(42)
    samples = rng.sample(unique, min(10, len(unique)))
    for q, a in samples:
        print(f"Q: {q}")
        print(f"A: {a}")
        print()


if __name__ == "__main__":
    print_dataset_info()

View File

@@ -1,746 +0,0 @@
#!/usr/bin/env python3
"""
TinyTalks Q&A Generation (2017) - Transformer Era
==================================================
📚 HISTORICAL CONTEXT:
In 2017, Vaswani et al. published "Attention Is All You Need", showing that
attention mechanisms alone (no RNNs!) could achieve state-of-the-art results
on sequence tasks. This breakthrough launched the era of GPT, BERT, and modern LLMs.
🎯 WHAT YOU'RE BUILDING:
Using YOUR TinyTorch implementations, you'll build a character-level conversational
model that learns to answer questions - proving YOUR attention mechanism works!
TinyTalks is PERFECT for learning:
- Small dataset (17.5 KB) = 3-5 minute training!
- Clear Q&A format (easy to verify learning)
- Progressive difficulty (5 levels)
- Instant gratification: Watch your transformer learn to chat!
✅ REQUIRED MODULES (Run after Module 13):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Module 01 (Tensor) : YOUR data structure with autograd
Module 02 (Activations) : YOUR ReLU and GELU activations
Module 03 (Layers) : YOUR Linear layers
Module 04 (Losses) : YOUR CrossEntropyLoss
Module 05 (Autograd) : YOUR automatic differentiation
Module 06 (Optimizers) : YOUR Adam optimizer
Module 08 (DataLoader) : YOUR data batching
Module 10 (Tokenization) : YOUR CharTokenizer for text→numbers
Module 11 (Embeddings) : YOUR token & positional embeddings
Module 12 (Attention) : YOUR multi-head self-attention
Module 13 (Transformers) : YOUR LayerNorm + TransformerBlock + GPT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🏗️ ARCHITECTURE (Character-Level Q&A Model):
┌──────────────────────────────────────────────────────────────────────────────┐
│ Output Predictions │
│ Character Probabilities (vocab_size) │
└──────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────────────┐
│ Output Projection │
│ Module 03: vectors → vocabulary │
└──────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────────────┐
│ Layer Norm │
│ Module 13: Final normalization │
└──────────────────────────────────────────────────────────────────────────────┘
╔══════════════════════════════════════════════════════════════════════════════╗
║ Transformer Block × N (Repeat) ║
║ ┌────────────────────────────────────────────────────────────────────────┐ ║
║ │ Feed Forward Network │ ║
║ │ Module 03: Linear → GELU → Linear │ ║
║ └────────────────────────────────────────────────────────────────────────┘ ║
║ ▲ ║
║ ┌────────────────────────────────────────────────────────────────────────┐ ║
║ │ Multi-Head Self-Attention │ ║
║ │ Module 12: Query·Key^T·Value across all positions │ ║
║ └────────────────────────────────────────────────────────────────────────┘ ║
╚══════════════════════════════════════════════════════════════════════════════╝
┌──────────────────────────────────────────────────────────────────────────────┐
│ Positional Encoding │
│ Module 11: Add position information │
└──────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────────────┐
│ Character Embeddings │
│ Module 11: chars → embed_dim vectors │
└──────────────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────────────┐
│ Input Characters │
"Q: What color is the sky? A:"
└──────────────────────────────────────────────────────────────────────────────┘
📊 EXPECTED PERFORMANCE:
- Dataset: 17.5 KB TinyTalks (301 Q&A pairs, 5 difficulty levels)
- Training time: 3-5 minutes (instant gratification!)
- Vocabulary: ~68 unique characters (simple English Q&A)
- Expected: 70-80% accuracy on Level 1-2 questions after training
- Parameters: ~1.2M (perfect size for fast learning on small data)
💡 WHAT TO WATCH FOR:
- Epoch 1-3: Model learns Q&A structure ("A:" follows "Q:")
- Epoch 4-7: Starts giving sensible (if incorrect) answers
- Epoch 8-12: 50-60% accuracy on simple questions
- Epoch 13-20: 70-80% accuracy, proper grammar
- Success = "Wow, my transformer actually learned to answer questions!"
"""
import sys
import os
import numpy as np
import argparse
import time
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich import box
# Add project root to path
# The three dirname() calls resolve to the directory two levels above the one
# that contains this file.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Appended rather than inserted at the front, so existing sys.path entries are
# searched first.
sys.path.append(project_root)
# Shared rich Console that the functions in this script print through.
console = Console()
def print_banner():
    """Show the milestone title banner inside a double-bordered blue panel."""
    banner_panel = Panel(
        """
╔══════════════════════════════════════════════════════════════════╗
║ ║
║ 🤖 TinyTalks Q&A Bot Training (2017) ║
║ Transformer Architecture ║
║ ║
"Your first transformer learning to answer questions!"
║ ║
╚══════════════════════════════════════════════════════════════════╝
""",
        border_style="bright_blue",
        box=box.DOUBLE,
    )
    console.print(banner_panel)
def filter_by_levels(text, levels):
    """
    Filter TinyTalks dataset to only include specified difficulty levels.

    Levels are marked in the original generation as:
    L1: Greetings (47 pairs)
    L2: Facts (82 pairs)
    L3: Math (45 pairs)
    L4: Reasoning (87 pairs)
    L5: Context (40 pairs)

    The text itself carries no level markers, so each question is classified
    heuristically by keyword; the first level (1 → 4) with a matching keyword
    wins and anything unmatched is level 5:
    L1: Hello, Hi, What is your name, etc.
    L2: What color, How many, etc.
    L3: What is X plus/minus, etc.

    Keywords must begin at a word boundary, so "hi" no longer fires inside
    "which" or "this" and "use" no longer fires inside "mouse". Longer words
    that merely start with a keyword ("used", "what does") still match.

    Args:
        text: Dataset text made of blank-line-separated blocks, each exactly
            a "Q:" line followed by an "A:" line. Other blocks are dropped
            when filtering.
        levels: Iterable of level numbers to keep, or None for everything.

    Returns:
        ``text`` unchanged when ``levels`` is None or covers all five levels
        (in any order); otherwise the kept pairs re-joined as
        "Q: ...\\nA: ..." blocks separated by blank lines.
    """
    import re

    if levels is None or set(levels) >= {1, 2, 3, 4, 5}:
        return text  # Use full dataset

    # Checked in order; the first matching level wins.
    level_keywords = (
        (1, ['hello', 'hi', 'hey', 'goodbye', 'bye', 'name', 'who are you', 'what are you']),
        (2, ['color', 'legs', 'days', 'months', 'sound', 'capital']),
        (3, ['plus', 'minus', 'times', 'divided', 'equals']),
        (4, ['use', 'where do', 'what do', 'happens if', 'need to']),
    )
    # Compile once per call: leading \b anchors every keyword at a word start.
    level_patterns = [
        (level, re.compile(r"\b(?:" + "|".join(re.escape(kw) for kw in keywords) + ")"))
        for level, keywords in level_keywords
    ]

    # Parse Q&A pairs
    pairs = []
    for block in text.strip().split('\n\n'):
        lines = block.strip().split('\n')
        if len(lines) != 2 or not lines[0].startswith('Q:') or not lines[1].startswith('A:'):
            continue
        # Drop only the 2-character "Q:"/"A:" prefix; strip() removes the
        # optional space, so "Q:Hello" keeps its leading "H".
        q = lines[0][2:].strip()
        a = lines[1][2:].strip()

        # Classify level (heuristic)
        q_lower = q.lower()
        level = next(
            (lvl for lvl, pattern in level_patterns if pattern.search(q_lower)),
            5,  # default
        )
        if level in levels:
            pairs.append(f"Q: {q}\nA: {a}")

    filtered_text = '\n\n'.join(pairs)
    console.print(f"[yellow]📊 Filtered to Level(s) {levels}:[/yellow]")
    console.print(f" Q&A pairs: {len(pairs)}")
    console.print(f" Characters: {len(filtered_text)}")
    return filtered_text
class TinyTalksDataset:
    """
    Character-level dataset for TinyTalks Q&A.

    Creates sequences of characters for autoregressive language modeling:
    - Input: "Q: What color is the sky? A: The sk"
    - Target: ": What color is the sky? A: The sky"

    The model learns to predict the next character given previous characters,
    naturally learning the Q&A pattern.
    """

    def __init__(self, text, seq_length=64, levels=None):
        """
        Args:
            text: Full text string (Q&A pairs)
            seq_length: Length of input sequences
            levels: List of difficulty levels to include (1-5), None = all
        """
        from tinytorch.text.tokenization import CharTokenizer

        self.seq_length = seq_length

        # Filter by levels if specified
        if levels:
            text = filter_by_levels(text, levels)

        # Store original text for testing
        self.text = text

        # Build character vocabulary using CharTokenizer
        self.tokenizer = CharTokenizer()
        self.tokenizer.build_vocab([text])

        # Encode entire text
        self.data = self.tokenizer.encode(text)

        console.print(f"[green]✓[/green] Dataset initialized:")
        console.print(f" Total characters: {len(text)}")
        console.print(f" Vocabulary size: {self.tokenizer.vocab_size}")
        console.print(f" Sequence length: {seq_length}")
        console.print(f" Total sequences: {len(self)}")

    def __len__(self):
        """Number of possible sequences (0 when the text is not longer than seq_length)."""
        # Clamp at zero: a negative value would make len() itself raise
        # ValueError, e.g. from the summary print in __init__ after heavy
        # level filtering.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """
        Get one training example.

        Args:
            idx: Sequence index in ``range(len(self))``; negative values
                count from the end.

        Returns:
            input_seq: Characters [idx : idx+seq_length]
            target_seq: Characters [idx+1 : idx+seq_length+1] (shifted by 1)

        Raises:
            IndexError: If ``idx`` is out of range. Slicing would otherwise
                silently return sequences shorter than ``seq_length``.
        """
        count = len(self)
        if idx < 0:
            idx += count
        if not 0 <= idx < count:
            raise IndexError(f"sequence index out of range for dataset of {count} sequences")
        input_seq = self.data[idx:idx + self.seq_length]
        target_seq = self.data[idx + 1:idx + self.seq_length + 1]
        return input_seq, target_seq

    def decode(self, indices):
        """Decode token indices back to text"""
        return self.tokenizer.decode(indices)
class TinyGPT:
    """
    Character-level GPT model for TinyTalks Q&A.

    This is a simplified GPT architecture:
    1. Token embeddings (convert characters to vectors)
    2. Positional encodings (add position information)
    3. N transformer blocks (self-attention + feed-forward)
    4. Final layer normalization
    5. Output projection (vectors back to character logits)

    Built entirely from YOUR TinyTorch modules!
    """

    def __init__(self, vocab_size, embed_dim=128, num_layers=4, num_heads=4,
                 max_seq_len=64, dropout=0.1):
        """
        Args:
            vocab_size: Number of unique characters
            embed_dim: Dimension of embeddings and hidden states
            num_layers: Number of transformer blocks
            num_heads: Number of attention heads per block
            max_seq_len: Maximum sequence length
            dropout: Dropout probability, passed to every transformer block
        """
        from tinytorch.text.embeddings import Embedding, PositionalEncoding
        from tinytorch.models.transformer import LayerNorm, TransformerBlock
        from tinytorch.core.layers import Linear

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # 1. Token embeddings: char_id → embed_dim vector
        self.token_embedding = Embedding(vocab_size, embed_dim)

        # 2. Positional encoding: add position information
        self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)

        # 3. Transformer blocks (stacked)
        self.blocks = [
            TransformerBlock(
                embed_dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=4,  # FFN hidden_dim = 4 * embed_dim
                dropout_prob=dropout
            )
            for _ in range(num_layers)
        ]

        # 4. Final layer normalization
        self.ln_f = LayerNorm(embed_dim)

        # 5. Output projection: embed_dim → vocab_size
        self.output_proj = Linear(embed_dim, vocab_size)

        console.print(f"[green]✓[/green] TinyGPT model initialized:")
        console.print(f" Vocabulary: {vocab_size}")
        console.print(f" Embedding dim: {embed_dim}")
        console.print(f" Layers: {num_layers}")
        console.print(f" Heads: {num_heads}")
        console.print(f" Max sequence: {max_seq_len}")

        # Count parameters
        total_params = self.count_parameters()
        console.print(f" [bold]Total parameters: {total_params:,}[/bold]")

    def forward(self, x):
        """
        Forward pass through the model.

        Args:
            x: Input tensor of shape (batch, seq_len) with token indices

        Returns:
            logits: Output tensor of shape (batch, seq_len, vocab_size)
        """
        # 1. Token embeddings: (batch, seq_len) → (batch, seq_len, embed_dim)
        x = self.token_embedding.forward(x)

        # 2. Add positional encoding
        x = self.pos_encoding.forward(x)

        # 3. Pass through transformer blocks
        for block in self.blocks:
            x = block.forward(x)

        # 4. Final layer norm
        x = self.ln_f.forward(x)

        # 5. Project to vocabulary: (batch, seq_len, embed_dim) → (batch, seq_len, vocab_size)
        logits = self.output_proj.forward(x)
        return logits

    def parameters(self):
        """Get all trainable parameters.

        Collects the parameters of every sub-module in forward order and, as
        a side effect, sets ``requires_grad = True`` on each of them.
        """
        params = []
        # Token embeddings
        params.extend(self.token_embedding.parameters())
        # Positional encoding (contributes whatever its parameters() returns)
        params.extend(self.pos_encoding.parameters())
        # Transformer blocks
        for block in self.blocks:
            params.extend(block.parameters())
        # Final layer norm
        params.extend(self.ln_f.parameters())
        # Output projection
        params.extend(self.output_proj.parameters())

        # Ensure all require gradients
        for param in params:
            param.requires_grad = True
        return params

    def count_parameters(self):
        """Count total trainable parameters (sum of element counts)."""
        return sum(param.data.size for param in self.parameters())

    def generate(self, tokenizer, prompt="Q:", max_new_tokens=100, temperature=1.0):
        """
        Generate text autoregressively.

        Args:
            tokenizer: CharTokenizer for encoding/decoding
            prompt: Starting text
            max_new_tokens: How many characters to generate at most
            temperature: Sampling temperature (higher = more random). A
                value of 0 or below selects the most likely character at
                every step instead of sampling.

        Returns:
            Generated text string: the prompt followed by the generated
            characters. When generation stops early, the text ends with the
            "\\n\\nQ" that triggered the stop.
        """
        from tinytorch.core.tensor import Tensor

        # Encode prompt. Copy into a fresh list so appending below works for
        # any sequence type and never mutates an object owned by the tokenizer.
        indices = list(tokenizer.encode(prompt))

        # Generate tokens one at a time
        for _ in range(max_new_tokens):
            # Get last max_seq_len tokens (context window)
            context = indices[-self.max_seq_len:]

            # Prepare input: (1, seq_len)
            x_input = Tensor(np.array([context]))

            # Forward pass
            logits = self.forward(x_input)

            # Get logits for last position: (vocab_size,)
            last_logits = logits.data[0, -1, :]

            if temperature <= 0:
                # Greedy decoding: the limit of sampling as temperature → 0.
                # Also avoids dividing the logits by zero.
                next_idx = int(np.argmax(last_logits))
            else:
                scaled = last_logits / temperature
                # Softmax, shifted by the max for numerical stability
                exp_logits = np.exp(scaled - np.max(scaled))
                probs = exp_logits / np.sum(exp_logits)
                # Sample from distribution
                next_idx = int(np.random.choice(len(probs), p=probs))

            # Append to sequence
            indices.append(next_idx)

            # Stop once the model starts the next pair, i.e. the last three
            # characters are a blank line followed by "Q".
            if len(indices) > 3 and tokenizer.decode(indices[-3:]) == "\n\nQ":
                break

        return tokenizer.decode(indices)
def test_model_predictions(model, dataset, test_prompts=None):
    """Generate and print an answer for each prompt.

    Args:
        model: Object exposing ``generate(tokenizer, prompt=..., max_new_tokens=..., temperature=...)``.
        dataset: Object whose ``tokenizer`` attribute is handed to ``generate``.
        test_prompts: Question strings; three default greetings/identity
            questions are used when None.

    A failure on one prompt is printed (first 50 characters of the error)
    and does not stop the remaining prompts.
    """
    prompts = ["Q: Hello!", "Q: What is your name?", "Q: Hi!"] if test_prompts is None else test_prompts

    console.print("\n[bold yellow]🧪 Testing Live Predictions:[/bold yellow]")
    for prompt in prompts:
        try:
            full_prompt = prompt + "\nA:"
            response = model.generate(
                dataset.tokenizer,
                prompt=full_prompt,
                max_new_tokens=30,
                temperature=0.5,
            )
            # Extract just the answer: the first line after the first "\nA:".
            _, marker, after_marker = response.partition("\nA:")
            if marker:
                answer = after_marker.split("\n")[0].strip()
            else:
                # No marker in the output: fall back to everything past the prompt.
                answer = response[len(full_prompt):].strip()
            console.print(f" {prompt}")
            console.print(f" → [cyan]{answer}[/cyan]")
        except Exception as e:
            console.print(f" {prompt} → [red]Error: {str(e)[:50]}[/red]")
def train_tinytalks_gpt(model, dataset, optimizer, criterion, epochs=20, batch_size=32,
                        log_interval=50, test_prompts=None):
    """
    Train the TinyGPT model on TinyTalks dataset.

    Training loop:
    1. Sample random batch of sequences
    2. Forward pass: predict next character for each position
    3. Compute cross-entropy loss
    4. Backward pass: compute gradients
    5. Update parameters with Adam
    6. Periodically test on sample questions to show learning

    Args:
        model: TinyGPT instance
        dataset: TinyTalksDataset instance
        optimizer: Adam optimizer
        criterion: CrossEntropyLoss
        epochs: Number of training epochs
        batch_size: Number of sequences per batch
        log_interval: Print loss every N batches
        test_prompts: Optional list of questions to test during training

    Raises:
        ValueError: If the dataset contains no sequences.
    """
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.autograd import enable_autograd

    # Enable autograd
    enable_autograd()

    if len(dataset) <= 0:
        raise ValueError(
            "Dataset has no training sequences; provide more text or use a shorter seq_length"
        )

    console.print("\n[bold cyan]Starting Training...[/bold cyan]")
    console.print(f" Epochs: {epochs}")
    console.print(f" Batch size: {batch_size}")
    console.print(f" Dataset size: {len(dataset)} sequences")

    # Batches per epoch: capped at 500, but at least 1. Batches are sampled
    # with replacement, so a dataset smaller than batch_size can still fill a
    # batch; without the floor the epoch would run zero batches and the
    # average-loss division below would fail.
    batches_per_epoch = max(1, min(500, len(dataset) // batch_size))

    start_time = time.time()
    for epoch in range(epochs):
        epoch_start = time.time()
        epoch_loss = 0.0
        num_batches = 0

        for batch_idx in range(batches_per_epoch):
            # Sample random batch
            batch_indices = np.random.randint(0, len(dataset), size=batch_size)
            batch_inputs = []
            batch_targets = []
            for idx in batch_indices:
                input_seq, target_seq = dataset[int(idx)]
                batch_inputs.append(input_seq)
                batch_targets.append(target_seq)

            # Convert to tensors: (batch, seq_len)
            batch_input = Tensor(np.array(batch_inputs))
            batch_target = Tensor(np.array(batch_targets))

            # Forward pass
            logits = model.forward(batch_input)

            # Reshape for loss computation: (batch, seq, vocab) → (batch*seq, vocab)
            # IMPORTANT: Use Tensor.reshape() to preserve computation graph!
            batch_size_actual, seq_length, vocab_size = logits.shape
            logits_2d = logits.reshape(batch_size_actual * seq_length, vocab_size)
            targets_1d = batch_target.reshape(-1)

            # Compute loss
            loss = criterion.forward(logits_2d, targets_1d)

            # Backward pass
            loss.backward()

            # Update parameters
            optimizer.step()

            # Zero gradients (after the step, ready for the next batch)
            optimizer.zero_grad()

            # Track loss
            batch_loss = float(loss.data)
            epoch_loss += batch_loss
            num_batches += 1

            # Log progress
            if (batch_idx + 1) % log_interval == 0 or batch_idx == 0:
                avg_loss = epoch_loss / num_batches
                elapsed = time.time() - start_time
                console.print(
                    f" Epoch {epoch+1}/{epochs} | "
                    f"Batch {batch_idx+1}/{batches_per_epoch} | "
                    f"Loss: {batch_loss:.4f} | "
                    f"Avg: {avg_loss:.4f} | "
                    f"Time: {elapsed:.1f}s"
                )

        # Epoch summary
        avg_epoch_loss = epoch_loss / num_batches
        epoch_time = time.time() - epoch_start
        console.print(
            f"[green]✓[/green] Epoch {epoch+1}/{epochs} complete | "
            f"Avg Loss: {avg_epoch_loss:.4f} | "
            f"Time: {epoch_time:.1f}s"
        )

        # Test model on the first epoch, every 5th epoch and the last epoch
        # to show learning progress
        if (epoch + 1) % 5 == 0 or epoch == 0 or epoch == epochs - 1:
            test_model_predictions(model, dataset, test_prompts)

    total_time = time.time() - start_time
    console.print(f"\n[bold green]✓ Training complete![/bold green]")
    console.print(f" Total time: {total_time/60:.2f} minutes")
def demo_questions(model, tokenizer):
    """
    Ask the trained model a fixed set of sample questions and print the answers.

    Each question is sent to ``model.generate`` as ``"<question>\\nA:"``.
    When the generated text contains the ``"\\nA:"`` marker, only the first
    line following the first marker is shown; otherwise the raw response is
    printed dimmed.

    Args:
        model: Object exposing ``generate(tokenizer, prompt=..., max_new_tokens=..., temperature=...)``.
        tokenizer: Tokenizer passed straight through to ``model.generate``.
    """
    answer_marker = "\nA:"
    rule = "=" * 70

    console.print("\n" + rule)
    console.print("[bold cyan]🤖 TinyBot Demo: Ask Me Questions![/bold cyan]")
    console.print(rule)

    # Sample questions of varying difficulty
    sample_questions = (
        "Q: Hello!",
        "Q: What is your name?",
        "Q: What color is the sky?",
        "Q: How many legs does a dog have?",
        "Q: What is 2 plus 3?",
        "Q: What do you use a pen for?",
    )

    for prompt_text in sample_questions:
        console.print(f"\n[yellow]{prompt_text}[/yellow]")

        generated = model.generate(
            tokenizer,
            prompt=prompt_text + answer_marker,
            max_new_tokens=50,
            temperature=0.8,
        )

        _, marker, after_marker = generated.partition(answer_marker)
        if not marker:
            # No answer marker in the output: show whatever came back.
            console.print(f"[dim]{generated}[/dim]")
            continue

        # Keep only the first line after the first marker.
        answer = after_marker.partition("\n")[0].strip()
        console.print(f"[green]A: {answer}[/green]")

    console.print("\n" + rule)
def main():
    """
    Main training pipeline for TinyGPT on the TinyTalks Q&A dataset.

    Steps, in order:
      1. Parse command-line options (model size, training schedule, levels).
      2. Verify the required TinyTorch modules can be imported.
      3. Load ``datasets/tinytalks/splits/train.txt`` under ``project_root``.
      4. Build the dataset, model, optimizer and loss.
      5. Print the configuration table, train, then run the Q&A demo.

    Returns early (after printing guidance) when the TinyTorch imports fail
    or the dataset file is missing. A malformed ``--levels`` value is reported
    through ``parser.error`` (usage message, exit status 2) instead of an
    uncaught ``ValueError`` traceback.
    """
    parser = argparse.ArgumentParser(description='Train TinyGPT on TinyTalks Q&A')
    parser.add_argument('--epochs', type=int, default=30, help='Number of training epochs (default: 30)')
    parser.add_argument('--batch-size', type=int, default=16, help='Batch size (default: 16)')
    parser.add_argument('--lr', type=float, default=0.001, help='Learning rate (default: 0.001)')
    parser.add_argument('--seq-length', type=int, default=64, help='Sequence length (default: 64)')
    parser.add_argument('--embed-dim', type=int, default=96, help='Embedding dimension (default: 96, ~500K params)')
    parser.add_argument('--num-layers', type=int, default=4, help='Number of transformer layers (default: 4)')
    parser.add_argument('--num-heads', type=int, default=4, help='Number of attention heads (default: 4)')
    parser.add_argument('--levels', type=str, default=None, help='Difficulty levels to train on (e.g. "1" or "1,2"). Default: all levels')
    args = parser.parse_args()

    # Parse levels argument. Blank entries (e.g. a trailing comma) are ignored;
    # anything that is not an integer is reported as a normal CLI usage error.
    levels = None
    if args.levels:
        try:
            levels = [int(item) for item in args.levels.split(',') if item.strip()]
        except ValueError:
            parser.error(
                f"--levels must be a comma-separated list of integers, got {args.levels!r}"
            )
        if not levels:
            parser.error(f"--levels did not contain any level numbers: {args.levels!r}")

    print_banner()

    # Import TinyTorch components
    console.print("\n[bold]Importing TinyTorch components...[/bold]")
    try:
        # Tensor and CharTokenizer are not used directly below; importing them
        # here doubles as an availability check for those modules.
        from tinytorch.core.tensor import Tensor
        from tinytorch.core.optimizers import Adam
        from tinytorch.core.losses import CrossEntropyLoss
        from tinytorch.text.tokenization import CharTokenizer
        console.print("[green]✓[/green] All modules imported successfully!")
    except ImportError as e:
        console.print(f"[red]✗[/red] Import error: {e}")
        console.print("\nMake sure you have completed all required modules:")
        console.print("  - Module 01 (Tensor)")
        console.print("  - Module 02 (Activations)")
        console.print("  - Module 03 (Layers)")
        console.print("  - Module 04 (Losses)")
        console.print("  - Module 05 (Autograd)")
        console.print("  - Module 06 (Optimizers)")
        console.print("  - Module 10 (Tokenization)")
        console.print("  - Module 11 (Embeddings)")
        console.print("  - Module 12 (Attention)")
        console.print("  - Module 13 (Transformers)")
        return

    # Load TinyTalks dataset
    console.print("\n[bold]Loading TinyTalks dataset...[/bold]")
    dataset_path = os.path.join(project_root, "datasets", "tinytalks", "splits", "train.txt")
    if not os.path.exists(dataset_path):
        console.print(f"[red]✗[/red] Dataset not found: {dataset_path}")
        console.print("\nPlease generate the dataset first:")
        console.print("  python datasets/tinytalks/scripts/generate_tinytalks.py")
        return

    with open(dataset_path, 'r', encoding='utf-8') as f:
        text = f.read()
    console.print(f"[green]✓[/green] Loaded dataset from: {os.path.basename(dataset_path)}")
    console.print(f"  File size: {len(text)} characters")

    # Create dataset with level filtering
    dataset = TinyTalksDataset(text, seq_length=args.seq_length, levels=levels)

    # Set test prompts based on levels (lowest selected level wins)
    if levels and 1 in levels:
        test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: Hi!"]
    elif levels and 2 in levels:
        test_prompts = ["Q: What color is the sky?", "Q: How many legs does a dog have?"]
    elif levels and 3 in levels:
        test_prompts = ["Q: What is 2 plus 3?", "Q: What is 5 minus 2?"]
    else:
        test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: What color is the sky?"]

    # Initialize model
    console.print("\n[bold]Initializing TinyGPT model...[/bold]")
    model = TinyGPT(
        vocab_size=dataset.tokenizer.vocab_size,
        embed_dim=args.embed_dim,
        num_layers=args.num_layers,
        num_heads=args.num_heads,
        max_seq_len=args.seq_length,
        dropout=0.1
    )

    # Initialize optimizer and loss
    console.print("\n[bold]Initializing training components...[/bold]")
    optimizer = Adam(model.parameters(), lr=args.lr)
    criterion = CrossEntropyLoss()
    console.print(f"[green]✓[/green] Optimizer: Adam (lr={args.lr})")
    console.print("[green]✓[/green] Loss: CrossEntropyLoss")

    # Print configuration
    table = Table(title="Training Configuration", box=box.ROUNDED)
    table.add_column("Parameter", style="cyan")
    table.add_column("Value", style="green")
    dataset_desc = f"TinyTalks Level(s) {levels}" if levels else "TinyTalks (All Levels)"
    table.add_row("Dataset", dataset_desc)
    table.add_row("Vocabulary Size", str(dataset.tokenizer.vocab_size))
    table.add_row("Model Parameters", f"{model.count_parameters():,}")
    table.add_row("Epochs", str(args.epochs))
    table.add_row("Batch Size", str(args.batch_size))
    table.add_row("Learning Rate", str(args.lr))
    table.add_row("Sequence Length", str(args.seq_length))
    table.add_row("Embedding Dim", str(args.embed_dim))
    table.add_row("Layers", str(args.num_layers))
    table.add_row("Attention Heads", str(args.num_heads))
    table.add_row("Expected Time", "3-5 minutes")
    console.print(table)

    # Train model
    train_tinytalks_gpt(
        model=model,
        dataset=dataset,
        optimizer=optimizer,
        criterion=criterion,
        epochs=args.epochs,
        batch_size=args.batch_size,
        log_interval=50,
        test_prompts=test_prompts
    )

    # Demo Q&A
    demo_questions(model, dataset.tokenizer)

    # Success message
    console.print("\n[bold green]🎉 Congratulations![/bold green]")
    console.print("You've successfully trained a transformer to answer questions!")
    console.print("\nYou used:")
    console.print("  ✓ YOUR Tensor implementation (Module 01)")
    console.print("  ✓ YOUR Activations (Module 02)")
    console.print("  ✓ YOUR Linear layers (Module 03)")
    console.print("  ✓ YOUR CrossEntropyLoss (Module 04)")
    console.print("  ✓ YOUR Autograd system (Module 05)")
    console.print("  ✓ YOUR Adam optimizer (Module 06)")
    console.print("  ✓ YOUR CharTokenizer (Module 10)")
    console.print("  ✓ YOUR Embeddings (Module 11)")
    console.print("  ✓ YOUR Multi-Head Attention (Module 12)")
    console.print("  ✓ YOUR Transformer blocks (Module 13)")
    console.print("\n[bold]This is the foundation of ChatGPT, built by YOU from scratch![/bold]")


if __name__ == "__main__":
    main()

View File

@@ -1,427 +0,0 @@
"""
TinyTalks Interactive Learning Dashboard
=========================================
Watch a chatbot learn in real-time!
Students can see:
- Loss decreasing over time
- Responses improving from gibberish to coherent
- Learning progress at multiple checkpoints
- Interactive control (pause/continue)
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import numpy as np
import time
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.optimizers import Adam
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.models.transformer import GPT
from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
enable_autograd()
try:
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.live import Live
from rich.layout import Layout
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
RICH_AVAILABLE = True
except ImportError:
RICH_AVAILABLE = False
print("Note: Install 'rich' for better visualization: pip install rich")
# ============================================================================
# Tokenization (copied from tinytalks_chatbot.py)
# ============================================================================
def create_tokenizer(conversations):
    """Build a character-level vocabulary with four reserved special tokens.

    Ids 0-3 are ``<PAD>``, ``<SOS>``, ``<SEP>`` and ``<EOS>``; every distinct
    character found in the questions and answers (plus the joining space)
    follows in sorted order starting at id 4.

    Args:
        conversations: Iterable of ``(question, answer)`` string pairs.

    Returns:
        ``(char_to_idx, idx_to_char)``: the token-to-id mapping and its inverse.
    """
    reserved = ['<PAD>', '<SOS>', '<SEP>', '<EOS>']

    # Join everything into one string so the character set is collected in one pass.
    corpus = ' '.join(q + ' ' + a for q, a in conversations)
    vocabulary = reserved + sorted(set(corpus))

    char_to_idx = {symbol: index for index, symbol in enumerate(vocabulary)}
    idx_to_char = dict(enumerate(vocabulary))
    return char_to_idx, idx_to_char
def encode_conversation(question, answer, char_to_idx, max_len=80):
    """Encode a Q&A pair as ``<SOS> question <SEP> answer <EOS> <PAD>...``.

    Characters missing from ``char_to_idx`` are encoded as id 0. The result
    is right-padded with ``<PAD>`` up to ``max_len`` or cut to ``max_len``;
    cutting can remove the trailing ``<EOS>`` (and part of the answer).

    Args:
        question: Question text.
        answer: Answer text.
        char_to_idx: Token-to-id mapping containing the four special tokens.
        max_len: Length of the returned list.

    Returns:
        List of token ids.
    """
    lookup = char_to_idx.get
    encoded = [
        char_to_idx['<SOS>'],
        *(lookup(ch, 0) for ch in question),
        char_to_idx['<SEP>'],
        *(lookup(ch, 0) for ch in answer),
        char_to_idx['<EOS>'],
    ]

    shortfall = max_len - len(encoded)
    if shortfall > 0:
        encoded.extend([char_to_idx['<PAD>']] * shortfall)
        return encoded
    return encoded[:max_len]
def decode_tokens(tokens, idx_to_char):
    """Turn a sequence of token ids back into text.

    Ids 0, 1 and 2 (PAD, SOS, SEP) are dropped, id 3 (EOS) ends decoding, and
    any id missing from ``idx_to_char`` is rendered as ``'?'``.

    Args:
        tokens: Iterable of integer token ids.
        idx_to_char: Id-to-character mapping.

    Returns:
        The decoded string.
    """
    skipped_ids = (0, 1, 2)  # PAD, SOS, SEP
    eos_id = 3

    pieces = []
    for token in tokens:
        if token in skipped_ids:
            continue
        if token == eos_id:
            break
        pieces.append(idx_to_char.get(token, '?'))
    return ''.join(pieces)
def generate_response(model, question, char_to_idx, idx_to_char, max_len=50, seq_len=80):
    """Greedily generate a response to a question.

    The prompt is ``<SOS> question <SEP>``. On every step the prompt plus the
    tokens generated so far is right-padded with ``<PAD>`` (or cut) to exactly
    ``seq_len`` ids, run through ``model.forward``, and the arg-max token at
    the last non-padded position is taken as the next token. Generation stops
    on ``<EOS>``/``<PAD>``, after ``max_len`` tokens, or once that position
    falls outside the model output.

    Args:
        model: Model whose ``forward`` returns a tensor with ``.shape`` and
            ``.data`` indexable as ``[0, position, :]``.
        question: Question text.
        char_to_idx: Token-to-id mapping containing the four special tokens.
        idx_to_char: Id-to-character mapping used for decoding.
        max_len: Maximum number of tokens to generate.
        seq_len: Fixed input length fed to the model. Defaults to 80, the
            value that was previously hard-coded here.

    Returns:
        The decoded response string.
    """
    pad_id = char_to_idx['<PAD>']
    eos_id = char_to_idx['<EOS>']

    prompt = [char_to_idx['<SOS>']]
    # Unknown characters map to id 0, matching encode_conversation.
    prompt.extend(char_to_idx.get(c, 0) for c in question)
    prompt.append(char_to_idx['<SEP>'])

    generated_tokens = []
    for _ in range(max_len):
        context = prompt + generated_tokens
        # Index of the last real (non-padding) token; its logits predict the next one.
        next_pos = len(context) - 1

        # Pad (a negative count yields no padding) and then clip to the fixed window.
        window = (context + [pad_id] * (seq_len - len(context)))[:seq_len]
        x = Tensor(np.array([window], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        if next_pos >= logits.shape[1]:
            break

        next_token = int(np.argmax(logits.data[0, next_pos, :]))
        if next_token in (eos_id, pad_id):
            break
        generated_tokens.append(next_token)

    return decode_tokens(generated_tokens, idx_to_char)
# ============================================================================
# Interactive Training with Checkpoints
# ============================================================================
def evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char):
    """Generate a response for every test question.

    Returns:
        List of ``(question, response)`` tuples, in the order of ``test_questions``.
    """
    return [
        (prompt, generate_response(model, prompt, char_to_idx, idx_to_char))
        for prompt in test_questions
    ]
def show_checkpoint_panel(checkpoint_num, step, loss, results, prev_results=None):
    """Print the results of one checkpoint evaluation.

    With ``rich`` available, a header and a Question/Response table are
    rendered; when ``prev_results`` is given, a third column shows 📈 if the
    new response is longer than the previous one and 📉 otherwise. Without
    ``rich`` the questions and responses are printed as plain text and
    ``prev_results`` is not used.

    Args:
        checkpoint_num: Checkpoint label shown in the header.
        step: Training step count.
        loss: Loss value shown in the header.
        results: List of ``(question, response)`` pairs.
        prev_results: Optional list of pairs from the previous checkpoint.
    """
    rule = "=" * 70

    if not RICH_AVAILABLE:
        # Fallback to simple print
        print()
        print(rule)
        print(f"CHECKPOINT {checkpoint_num} - Step {step:,} | Loss: {loss:.4f}")
        print(rule)
        print()
        for question, response in results:
            print(f"Q: {question}")
            print(f"A: {response}")
            print()
        return

    console = Console()

    # Header
    console.print()
    console.print(rule, style="bold cyan")
    console.print(f"CHECKPOINT {checkpoint_num} - Step {step:,} | Loss: {loss:.4f}",
                  style="bold yellow", justify="center")
    console.print(rule, style="bold cyan")
    console.print()

    # Response table
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Question", style="cyan", width=25)
    table.add_column("Response", style="green", width=35)
    if prev_results:
        table.add_column("Previous", style="dim", width=10)

    for position, (question, response) in enumerate(results):
        cells = [question, response]
        if prev_results and position < len(prev_results):
            earlier_response = prev_results[position][1]
            # Trend marker compares response lengths only.
            cells.append("📈" if len(response) > len(earlier_response) else "📉")
        table.add_row(*cells)

    console.print(table)
    console.print()
def train_interactive(model, optimizer, loss_fn, train_data, test_questions,
                      char_to_idx, idx_to_char, max_time_minutes=15,
                      checkpoint_steps=1000, auto_continue_seconds=10):
    """
    Train for a fixed wall-clock budget, pausing at checkpoints to show results.

    Each step trains on one randomly chosen encoded conversation (next-token
    prediction), with gradients clipped element-wise to [-1, 1]. The test
    questions are evaluated before training, every ``checkpoint_steps`` steps,
    and once more at the end.

    Args:
        model: Model exposing ``forward`` and ``parameters``.
        optimizer: Optimizer exposing ``zero_grad`` and ``step``.
        loss_fn: Loss object exposing ``forward(logits, targets)``.
        train_data: Sequence of token-id lists.
        test_questions: Questions evaluated at every checkpoint.
        char_to_idx: Token-to-id mapping.
        idx_to_char: Id-to-character mapping.
        max_time_minutes: Training stops once this much time has elapsed.
        checkpoint_steps: Pause every N steps to show results.
        auto_continue_seconds: Pause behaviour at each checkpoint:
            > 0 sleeps that many seconds, 0 continues after a 0.5 s sleep,
            and a negative value waits for ENTER.

    Returns:
        ``(losses, step)``: the per-step loss values and the number of steps run.
    """
    def _pause(timed_message, immediate_message, enter_prompt):
        # Shared pause logic for the start of training and every checkpoint.
        # NOTE: the timed branch is a plain sleep; ENTER does not cut it short
        # even though the printed message mentions it.
        if auto_continue_seconds > 0:
            print(timed_message)
            time.sleep(auto_continue_seconds)
        elif auto_continue_seconds == 0:
            print(immediate_message)
            time.sleep(0.5)
        else:
            input(enter_prompt)

    max_time_seconds = max_time_minutes * 60

    print("=" * 70)
    print(f"INTERACTIVE TRAINING - {max_time_minutes} MINUTES")
    print("=" * 70)
    print(f"Dataset: {len(train_data)} conversations")
    print(f"Checkpoints: Every {checkpoint_steps} steps")
    print(f"Auto-continue: {auto_continue_seconds}s (or press ENTER)")
    print("=" * 70)
    print()
    print("Watch the model learn from gibberish to coherent responses!")
    print()

    # Initial evaluation (before training)
    print("Evaluating initial model (untrained)...")
    initial_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
    # 999.9 is a placeholder: no loss has been measured yet.
    show_checkpoint_panel(0, 0, 999.9, initial_results)

    _pause(
        f"Starting training in {auto_continue_seconds} seconds (or press ENTER)...",
        "Starting training immediately...",
        "Press ENTER to start training...",
    )

    print()
    print("Training started...")
    print()

    start_time = time.time()
    losses = []
    step = 0
    checkpoint_num = 1
    prev_results = initial_results
    next_checkpoint = checkpoint_steps

    while True:
        elapsed = time.time() - start_time
        if elapsed >= max_time_seconds:
            break

        # Training step: inputs are tokens[:-1], targets are tokens[1:].
        tokens = train_data[np.random.randint(len(train_data))]
        input_seq = tokens[:-1]
        target_seq = tokens[1:]

        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)

        logits = model.forward(x)

        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = y_true.reshape(batch_size * seq_len)
        loss = loss_fn.forward(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()

        # Element-wise gradient clipping, done in place.
        for param in model.parameters():
            if param.grad is not None:
                np.clip(param.grad, -1.0, 1.0, out=param.grad)

        optimizer.step()

        losses.append(loss.data.item())
        step += 1

        # Show progress every 100 steps (mean over at most the last 100 losses)
        if step % 100 == 0:
            avg_loss = np.mean(losses[-100:])
            print(f"[{int(elapsed):4d}s] Step {step:5d} | Loss: {avg_loss:.4f}")

        # Checkpoint evaluation
        if step >= next_checkpoint:
            avg_loss = np.mean(losses[-100:])

            print()
            print(f"Evaluating at step {step}...")
            current_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
            show_checkpoint_panel(checkpoint_num, step, avg_loss, current_results, prev_results)

            prev_results = current_results
            checkpoint_num += 1
            next_checkpoint += checkpoint_steps

            # Interactive pause
            _pause(
                f"Continuing in {auto_continue_seconds}s (or press ENTER)...",
                "Continuing immediately...",
                "Press ENTER to continue training...",
            )

            print()
            print("Training resumed...")
            print()

    # Final results
    final_elapsed = time.time() - start_time
    if losses:
        final_loss = np.mean(losses[-100:])
        initial_loss = np.mean(losses[:10])
    else:
        # No step fit into the time budget: report NaN rather than averaging
        # an empty list.
        final_loss = initial_loss = float('nan')
    # Guard the ratio against a zero initial loss.
    improvement = (1 - final_loss / initial_loss) * 100 if initial_loss != 0 else 0.0

    print()
    print("=" * 70)
    print("TRAINING COMPLETE!")
    print("=" * 70)
    print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.1f} minutes)")
    print(f"Total steps: {step:,}")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()

    # Final evaluation
    print("Final evaluation...")
    final_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
    show_checkpoint_panel("FINAL", step, final_loss, final_results, prev_results)

    return losses, step
# ============================================================================
# Main
# ============================================================================
def main():
    """Run the TinyTalks interactive dashboard demo end to end.

    Builds the dataset and character tokenizer, encodes every conversation to
    a fixed length of 80 ids, creates a small GPT with an Adam optimizer and a
    cross-entropy loss, trains it with ``train_interactive`` for 5 minutes
    with checkpoints every 1000 steps, then prints a short summary.
    """
    rule = "=" * 70

    print(
        "",
        rule,
        "TINYTALKS INTERACTIVE LEARNING DASHBOARD",
        rule,
        "",
        "Watch a transformer learn to chat in real-time!",
        "You'll see responses improve from gibberish to coherent answers.",
        "",
        sep="\n",
    )

    # Dataset
    conversations = create_tinytalks_dataset()
    stats = get_dataset_stats()
    print(f"Dataset: {stats['total_examples']} examples ({stats['unique_examples']} unique)")
    print()

    # Tokenizer
    char_to_idx, idx_to_char = create_tokenizer(conversations)
    vocab_size = len(idx_to_char)

    # Encode every conversation to a fixed-length id list
    max_seq_len = 80
    train_data = []
    for question, answer in conversations:
        train_data.append(encode_conversation(question, answer, char_to_idx, max_seq_len))

    # Test questions for checkpoints
    test_questions = [
        "Hi",
        "How are you",
        "What is your name",
        "What is the sky",
        "Is grass green",
    ]

    # Model: Ultra-tiny for speed
    model = GPT(
        vocab_size=vocab_size,
        embed_dim=16,
        num_layers=1,
        num_heads=2,
        max_seq_len=max_seq_len,
    )

    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Model: {num_params:,} parameters")
    print()

    # Optimizer and loss
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Settings
    train_time = 5           # minutes (shorter for demo)
    checkpoint_steps = 1000  # evaluate every 1000 steps
    auto_continue = 0        # 0 = continue immediately at checkpoints

    print(f"Training for {train_time} minutes")
    print(f"Checkpoints every {checkpoint_steps} steps")
    print()

    # Train with interactive checkpoints
    losses, total_steps = train_interactive(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        test_questions=test_questions,
        char_to_idx=char_to_idx,
        idx_to_char=idx_to_char,
        max_time_minutes=train_time,
        checkpoint_steps=checkpoint_steps,
        auto_continue_seconds=auto_continue,
    )

    print(
        "",
        rule,
        "DEMO COMPLETE!",
        rule,
        "",
        "You just watched a transformer learn from scratch!",
        sep="\n",
    )
    print(f"{total_steps:,} training steps")
    print(f"{len(losses)} loss values")
    # Mean of the last 100 losses relative to the mean of the first 10.
    print(f"{(1 - np.mean(losses[-100:])/np.mean(losses[:10]))*100:.1f}% improvement")
    print("", "Key takeaway: Loss decrease = Better responses!", "", sep="\n")


if __name__ == "__main__":
    main()

View File

@@ -1,336 +0,0 @@
#!/usr/bin/env python3
"""
Monitored Training Script for TinyTalks
========================================
Features:
- Early stopping if loss doesn't improve
- Continuous progress monitoring
- Automatic experiment termination for bad runs
- Clear feedback on learning progress
Usage:
python train_monitored.py --mode test # 10 epochs, quick validation
python train_monitored.py --mode full # 30 epochs, full training
"""
import sys
import os
import argparse
import time
import numpy as np
from pathlib import Path
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn
from rich.table import Table
from rich import box
# Import TinyTorch components
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.core.optimizers import Adam
from tinytorch.text.tokenization import CharTokenizer
console = Console()
# Import TinyGPT and dataset classes by executing tinytalks_gpt.py in this
# module's namespace. read_text() closes the file and pins the encoding, which
# the previous bare open(...).read() did not.
# NOTE(review): exec shares this module's globals, including __name__; if
# tinytalks_gpt.py ends with an `if __name__ == "__main__":` guard it will fire
# when this script is run directly. Confirm, and prefer a regular import.
exec((project_root / "milestones/05_2017_transformer/tinytalks_gpt.py").read_text(encoding="utf-8"))
class TrainingMonitor:
    """Monitor training progress and implement early stopping.

    Every loss passed to ``check`` is recorded. A check counts as an
    improvement when the loss is more than ``min_delta`` below the best loss
    seen so far; ``patience`` consecutive checks without improvement signal
    that training should stop.
    """

    def __init__(self, patience=5, min_delta=0.01):
        """
        Args:
            patience: Number of checks without improvement before stopping
            min_delta: Minimum change in loss to count as improvement
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.checks_without_improvement = 0
        self.losses = []

    def check(self, current_loss):
        """
        Record a loss value and decide whether training should continue.

        Args:
            current_loss: Loss observed at this check.

        Returns:
            (should_continue, message)
        """
        self.losses.append(current_loss)

        # Calculate improvement relative to the best loss so far
        improvement = self.best_loss - current_loss

        if improvement > self.min_delta:
            # Significant improvement
            had_baseline = self.best_loss != float('inf')
            self.best_loss = current_loss
            self.checks_without_improvement = 0
            if not had_baseline:
                # Nothing to compare against yet; avoid reporting "improved by inf".
                return True, f"✓ Baseline loss recorded: {current_loss:.4f}"
            return True, f"✓ Loss improved by {improvement:.4f}"

        # No significant improvement
        self.checks_without_improvement += 1
        if self.checks_without_improvement >= self.patience:
            return False, f"✗ No improvement for {self.patience} checks. Stopping."
        return True, f"⚠ No improvement ({self.checks_without_improvement}/{self.patience})"

    def summary(self):
        """
        Get training summary.

        Returns:
            Dict with ``initial_loss``, ``final_loss``, ``best_loss``,
            ``total_decrease``, ``decrease_percent`` and ``num_checks``.
            A dict is always returned so callers can index it safely: with no
            recorded checks the loss fields are NaN and the decrease fields
            are 0.0.
        """
        if not self.losses:
            nan = float('nan')
            return {
                'initial_loss': nan,
                'final_loss': nan,
                'best_loss': nan,
                'total_decrease': 0.0,
                'decrease_percent': 0.0,
                'num_checks': 0
            }

        initial = self.losses[0]
        final = self.losses[-1]
        best = min(self.losses)
        decrease = initial - final
        # Percentage is only meaningful for a positive starting loss.
        decrease_pct = (decrease / initial) * 100 if initial > 0 else 0

        return {
            'initial_loss': initial,
            'final_loss': final,
            'best_loss': best,
            'total_decrease': decrease,
            'decrease_percent': decrease_pct,
            'num_checks': len(self.losses)
        }
def train_with_monitoring(model, dataset, optimizer, criterion, config, monitor):
    """
    Train with continuous monitoring and early stopping.

    Each epoch visits the dataset once in a freshly shuffled order, in batches
    of ``config['batch_size']`` (the last batch may be smaller). Every
    ``check_interval`` processed batches, the running average loss of the
    current epoch is passed to ``monitor.check``; training stops as soon as
    the monitor says so.

    Args:
        model: TinyGPT model
        dataset: TinyTalksDataset (supports ``len`` and integer indexing that
            yields ``(input_seq, target_seq)``)
        optimizer: Adam optimizer
        criterion: CrossEntropyLoss
        config: Training configuration dict with ``epochs``, ``batch_size``
            and optionally ``check_interval`` (default 50)
        monitor: TrainingMonitor instance

    Returns:
        success: True if training completed all epochs, False if early
        stopping was triggered

    Raises:
        ValueError: If ``dataset`` is empty (previously this surfaced as a
            ZeroDivisionError when computing the epoch average).
    """
    num_sequences = len(dataset)
    if num_sequences == 0:
        raise ValueError("Cannot train on an empty dataset")

    epochs = config['epochs']
    batch_size = config['batch_size']
    check_interval = config.get('check_interval', 50)  # Check every N batches

    console.print("\n[bold cyan]Starting Training with Monitoring[/bold cyan]")
    console.print(f"  Check interval: Every {check_interval} batches")
    console.print(f"  Early stopping: {monitor.patience} checks without improvement\n")

    total_batches_processed = 0
    start_time = time.time()

    for epoch in range(epochs):
        epoch_start = time.time()
        epoch_loss = 0.0
        batch_count = 0

        console.print(f"[bold]Epoch {epoch+1}/{epochs}[/bold]")

        # Create batches from a new random ordering of the dataset
        indices = np.random.permutation(num_sequences)

        for batch_start in range(0, num_sequences, batch_size):
            batch_indices = indices[batch_start:batch_start + batch_size]

            # Get batch data; int() converts the NumPy index to a plain int,
            # as the sibling training loop does.
            batch_inputs = []
            batch_targets = []
            for idx in batch_indices:
                input_seq, target_seq = dataset[int(idx)]
                batch_inputs.append(input_seq)
                batch_targets.append(target_seq)

            # Convert to tensors
            batch_input = Tensor(np.array(batch_inputs))
            batch_target = Tensor(np.array(batch_targets))

            # Forward pass
            logits = model.forward(batch_input)

            # Reshape for loss: flatten the batch and sequence dimensions
            batch_size_actual, seq_length, vocab_size = logits.shape
            logits_2d = logits.reshape(batch_size_actual * seq_length, vocab_size)
            targets_1d = batch_target.reshape(-1)

            # Compute loss
            loss = criterion.forward(logits_2d, targets_1d)

            # Backward and optimize
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Track loss
            loss_value = float(loss.data)
            epoch_loss += loss_value
            batch_count += 1
            total_batches_processed += 1

            # Monitor progress at check intervals
            if total_batches_processed % check_interval == 0:
                avg_loss = epoch_loss / batch_count
                should_continue, message = monitor.check(avg_loss)

                elapsed = time.time() - start_time
                console.print(f"  Batch {total_batches_processed} | Loss: {avg_loss:.4f} | {message} | Time: {elapsed:.1f}s")

                if not should_continue:
                    console.print(f"\n[yellow]Early stopping triggered at epoch {epoch+1}, batch {batch_count}[/yellow]")
                    return False

        # Epoch summary
        avg_epoch_loss = epoch_loss / batch_count
        epoch_time = time.time() - epoch_start
        console.print(f"  → Epoch {epoch+1} complete: Avg Loss = {avg_epoch_loss:.4f} | Time: {epoch_time:.1f}s\n")

    console.print("[green]✓ Training completed successfully![/green]\n")
    return True
def main():
    """Command-line entry point for monitored TinyTalks training.

    Parses the CLI options, enables autograd, builds the configuration for the
    selected mode (``test`` = 10 epochs, ``full`` = 30 epochs; all other
    settings are shared), loads the TinyTalks training split, trains with
    ``train_with_monitoring`` and prints a summary plus a recommendation.
    """
    parser = argparse.ArgumentParser(description='Monitored TinyTalks Training')
    parser.add_argument('--mode', choices=['test', 'full'], default='test',
                        help='Training mode: test (10 epochs) or full (30 epochs)')
    parser.add_argument('--patience', type=int, default=5,
                        help='Early stopping patience (checks without improvement)')
    parser.add_argument('--min-delta', type=float, default=0.01,
                        help='Minimum loss decrease to count as improvement')
    parser.add_argument('--check-interval', type=int, default=50,
                        help='Check progress every N batches')
    args = parser.parse_args()

    # Enable autograd
    enable_autograd()

    # Configuration based on mode: only the epoch count and label differ.
    if args.mode == 'test':
        epochs, mode_label = 10, 'TEST (Quick Validation)'
    else:  # full
        epochs, mode_label = 30, 'FULL (Complete Training)'
    config = {
        'epochs': epochs,
        'batch_size': 32,
        'lr': 0.001,
        'embed_dim': 128,
        'num_layers': 6,
        'num_heads': 8,
        'check_interval': args.check_interval,
        'mode': mode_label
    }

    # Display configuration
    console.print("\n[bold cyan]═══════════════════════════════════════════════════[/bold cyan]")
    console.print("[bold cyan] Monitored TinyTalks Training - Option C [/bold cyan]")
    console.print("[bold cyan]═══════════════════════════════════════════════════[/bold cyan]\n")

    table = Table(box=box.ROUNDED)
    table.add_column("Parameter", style="cyan")
    table.add_column("Value", style="yellow")
    table.add_row("Mode", config['mode'])
    table.add_row("Epochs", str(config['epochs']))
    table.add_row("Batch Size", str(config['batch_size']))
    table.add_row("Learning Rate", str(config['lr']))
    table.add_row("Model Size", f"{config['embed_dim']}d, {config['num_layers']}L, {config['num_heads']}H")
    table.add_row("Early Stopping Patience", str(args.patience))
    table.add_row("Min Delta", str(args.min_delta))
    table.add_row("Check Interval", f"Every {args.check_interval} batches")
    console.print(table)
    console.print()

    # Load dataset (explicit UTF-8 so the result does not depend on the locale)
    console.print("[bold]Loading TinyTalks dataset...[/bold]")
    dataset_path = project_root / "datasets/tinytalks/splits/train.txt"
    with open(dataset_path, 'r', encoding='utf-8') as f:
        text = f.read()
    dataset = TinyTalksDataset(text, seq_length=64)
    console.print(f"  ✓ Loaded: {len(text):,} chars, {dataset.tokenizer.vocab_size} vocab\n")

    # Initialize model
    console.print("[bold]Initializing model...[/bold]")
    model = TinyGPT(
        vocab_size=dataset.tokenizer.vocab_size,
        embed_dim=config['embed_dim'],
        num_layers=config['num_layers'],
        num_heads=config['num_heads'],
        max_seq_len=64
    )
    params = model.parameters()
    param_count = sum(p.data.size for p in params)
    console.print(f"  ✓ Model initialized: {param_count:,} parameters\n")

    # Initialize training components
    optimizer = Adam(params, lr=config['lr'])
    criterion = CrossEntropyLoss()
    monitor = TrainingMonitor(patience=args.patience, min_delta=args.min_delta)

    # Train
    console.print("[bold]Starting training...[/bold]\n")
    start_time = time.time()
    success = train_with_monitoring(model, dataset, optimizer, criterion, config, monitor)
    total_time = time.time() - start_time

    # Summary
    console.print("\n[bold cyan]═══════════════════════════════════════════════════[/bold cyan]")
    console.print("[bold cyan] Training Summary [/bold cyan]")
    console.print("[bold cyan]═══════════════════════════════════════════════════[/bold cyan]\n")

    summary = monitor.summary()
    # summary() may return a plain message instead of a dict when too few
    # checks were recorded; indexing that message would raise TypeError.
    have_stats = isinstance(summary, dict)

    result_table = Table(box=box.ROUNDED)
    result_table.add_column("Metric", style="cyan")
    result_table.add_column("Value", style="yellow")
    result_table.add_row("Status", "✓ SUCCESS" if success else "⚠ EARLY STOP")
    result_table.add_row("Total Time", f"{total_time/60:.1f} minutes")
    if have_stats:
        result_table.add_row("Initial Loss", f"{summary['initial_loss']:.4f}")
        result_table.add_row("Final Loss", f"{summary['final_loss']:.4f}")
        result_table.add_row("Best Loss", f"{summary['best_loss']:.4f}")
        result_table.add_row("Total Decrease", f"{summary['total_decrease']:.4f} ({summary['decrease_percent']:.1f}%)")
        result_table.add_row("Checks Performed", str(summary['num_checks']))
    else:
        result_table.add_row("Loss Statistics", str(summary))
    console.print(result_table)
    console.print()

    if not have_stats:
        console.print("[bold yellow]⚠ Too few monitoring checks ran to judge learning progress. Try a smaller --check-interval.[/bold yellow]")
        return

    # Recommendation
    if success and summary['decrease_percent'] > 50:
        console.print("[bold green]✓ EXCELLENT: Model is learning well! Continue with full training.[/bold green]")
    elif success and summary['decrease_percent'] > 20:
        console.print("[bold yellow]⚠ MODERATE: Model is learning but slowly. Consider tuning hyperparameters.[/bold yellow]")
    elif success:
        console.print("[bold red]✗ POOR: Model not learning effectively. Needs hyperparameter adjustment.[/bold red]")
    else:
        console.print("[bold red]✗ FAILED: Training stopped early. Try different hyperparameters.[/bold red]")


if __name__ == "__main__":
    main()