mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-03 13:59:36 -05:00
Remove non-Vaswani transformer examples
Keep only the three Vaswani examples that reference the 2017 "Attention Is All You Need" paper: vaswani_chatgpt.py (Q&A generation), vaswani_copilot.py (Python autocomplete), and vaswani_shakespeare.py (text generation). Removed 14 redundant example files.
This commit is contained in:
@@ -1,75 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download and prepare TinyStories dataset for TinyTorch training.
|
||||
|
||||
TinyStories is a dataset of simple, synthetic stories designed for
|
||||
training small language models. It's much easier than Shakespeare!
|
||||
"""
|
||||
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
def download_tinystories():
    """Download the TinyStories validation split and print a short summary.

    The file is stored as ``tinystories_val.txt`` under
    ``../datasets/tinystories`` relative to this script. If it already exists
    it is reused; otherwise it is fetched from Hugging Face. After the file is
    available, basic statistics and the beginning of the first story are
    printed.

    Returns:
        The path of the local dataset file, or ``None`` if the download failed.
    """
    # Create data directory
    data_dir = os.path.join(os.path.dirname(__file__), '../datasets/tinystories')
    os.makedirs(data_dir, exist_ok=True)

    # TinyStories validation set (smaller, good for testing)
    urls = {
        'tiny_val': 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-valid.txt',
        'tiny_train_small': 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories-train.txt'
    }

    print("📥 Downloading TinyStories dataset...")
    print("="*70)

    # Start with validation set (much smaller for testing)
    filename = 'tinystories_val.txt'
    filepath = os.path.join(data_dir, filename)

    if os.path.exists(filepath):
        print(f"✅ {filename} already exists")
        size = os.path.getsize(filepath) / (1024 * 1024)
        print(f" Size: {size:.2f} MB")
    else:
        print(f"⬇️ Downloading {filename}...")
        # Download to a temporary name first: an interrupted transfer must not
        # leave a truncated file at `filepath`, because the next run would
        # treat it as a complete dataset ("already exists").
        partial_path = filepath + '.part'
        try:
            urllib.request.urlretrieve(urls['tiny_val'], partial_path)
            os.replace(partial_path, filepath)
            size = os.path.getsize(filepath) / (1024 * 1024)
            print(f"✅ Downloaded! Size: {size:.2f} MB")
        except Exception as e:
            # Script-level boundary: report the problem and give the user a
            # manual fallback instead of a traceback.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(f"❌ Error downloading: {e}")
            print("\n💡 Alternative: Download manually from:")
            print(f" {urls['tiny_val']}")
            print(f" Save to: {filepath}")
            return None

    # Read and show sample
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    print(f"\n📊 Dataset Stats:")
    print(f" Total characters: {len(text):,}")
    print(f" Total words: {len(text.split()):,}")
    print(f" Unique characters: {len(set(text))}")

    # Show first story. str.split always returns at least one element, so the
    # first entry can be taken unconditionally.
    first_story = text.split('<|endoftext|>')[0].strip()
    print(f"\n📖 Sample Story:")
    print(" " + "-"*66)
    print(" " + first_story[:300].replace('\n', '\n '))
    if len(first_story) > 300:
        print(" ...")
    print(" " + "-"*66)

    print(f"\n✅ TinyStories ready for training!")
    print(f" Location: {filepath}")

    return filepath


if __name__ == '__main__':
    download_tinystories()
|
||||
@@ -1,338 +0,0 @@
|
||||
"""
|
||||
Milestone 05 - Level 1: Transformer Memorization Test
|
||||
======================================================
|
||||
|
||||
SIMPLEST POSSIBLE TRANSFORMER TEST:
|
||||
Can the transformer memorize and reproduce simple sequences?
|
||||
|
||||
Task: Given "ABCD", predict "BCDE"
|
||||
Given "1234", predict "2345"
|
||||
|
||||
Expected:
|
||||
- Train in < 2 minutes
|
||||
- Loss should drop from ~3.0 to < 0.1
|
||||
- Should perfectly predict next character
|
||||
|
||||
This validates:
|
||||
✓ Transformer architecture works
|
||||
✓ Attention mechanism works
|
||||
✓ Gradient flow works
|
||||
✓ Training loop works
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.models.transformer import GPT
|
||||
|
||||
enable_autograd()
|
||||
|
||||
# ============================================================================
|
||||
# Level 1: Simple Memorization Dataset
|
||||
# ============================================================================
|
||||
|
||||
def create_memorization_dataset():
    """
    Build the fixed list of five-character strings the model must memorize.

    The list contains, in order: alphabet runs, digit runs, single-letter
    repetitions, and two mixed letter/digit strings.
    """
    alphabet_runs = ["ABCDE", "FGHIJ", "KLMNO", "PQRST", "UVWXY"]
    digit_runs = ["12345", "67890"]
    repetitions = [letter * 5 for letter in "ABC"]
    mixed = ["A1B2C", "X9Y8Z"]
    return alphabet_runs + digit_runs + repetitions + mixed
|
||||
|
||||
|
||||
def create_simple_tokenizer(sequences):
    """Build character <-> id lookup tables from the given strings.

    Characters are numbered from 1 in sorted order; id 0 is reserved for the
    '<PAD>' entry. Returns the pair (char_to_idx, idx_to_char).
    """
    vocabulary = sorted(set(''.join(sequences)))

    char_to_idx = {}
    idx_to_char = {}
    for token_id, symbol in enumerate(vocabulary, start=1):
        char_to_idx[symbol] = token_id
        idx_to_char[token_id] = symbol

    # Padding is registered last, after all real characters.
    char_to_idx['<PAD>'] = 0
    idx_to_char[0] = '<PAD>'

    return char_to_idx, idx_to_char
|
||||
|
||||
|
||||
def encode_sequence(seq, char_to_idx, max_len=8):
    """Map a string to a list of token ids of length max_len.

    Characters missing from char_to_idx become 0. Longer inputs are cut at
    max_len; shorter ones are right-padded with 0.
    """
    ids = [char_to_idx.get(ch, 0) for ch in seq][:max_len]
    # A non-positive multiplier yields an empty list, so nothing is appended
    # once the sequence already fills max_len.
    ids.extend([0] * (max_len - len(ids)))
    return ids
|
||||
|
||||
|
||||
def decode_sequence(tokens, idx_to_char):
    """Turn token ids back into text, skipping id 0 and unknown ids."""
    pieces = []
    for token_id in tokens:
        if token_id == 0:
            continue
        pieces.append(idx_to_char.get(token_id, ''))
    return ''.join(pieces)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Training
|
||||
# ============================================================================
|
||||
|
||||
def train_memorization(model, optimizer, loss_fn, train_data, vocab_size, max_steps=200):
    """
    Fit the model on randomly drawn sequences using next-token prediction.

    Each step samples one encoded sequence, feeds all tokens but the last and
    scores the prediction against all tokens but the first. Gradients are
    clipped element-wise to [-1, 1] before the optimizer step. Progress is
    printed every 50 steps, and training stops early once the running average
    loss falls below 0.2.

    Note: the banner advertises a target of 0.1 while early stopping uses 0.2.

    Returns the list of per-step loss values.
    """
    rule = "=" * 70
    print(rule)
    print("TRAINING LEVEL 1: MEMORIZATION")
    print(rule)
    print(f"Dataset: {len(train_data)} sequences")
    print(f"Vocab size: {vocab_size}")
    print(f"Max steps: {max_steps}")
    print("Target: Loss < 0.1 in < 2 minutes")
    print()

    began = time.time()
    losses = []

    for step in range(max_steps):
        # One random training example per step (batch of 1).
        sample = train_data[np.random.randint(len(train_data))]
        x = Tensor(np.array([sample[:-1]], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([sample[1:]], dtype=np.int32), requires_grad=False)

        logits = model.forward(x)

        # Flatten (batch, position) so the loss sees one row per prediction.
        n_batch, n_pos, n_vocab = logits.shape
        n_rows = n_batch * n_pos
        loss = loss_fn.forward(logits.reshape(n_rows, n_vocab), y_true.reshape(n_rows))

        optimizer.zero_grad()
        loss.backward()

        # In-place element-wise gradient clipping.
        for param in model.parameters():
            if param.grad is None:
                continue
            np.clip(param.grad, -1.0, 1.0, out=param.grad)

        optimizer.step()
        losses.append(loss.data.item())

        if step % 50 != 0:
            continue

        # Slicing already copes with fewer than 100 recorded losses.
        avg_loss = np.mean(losses[-100:])
        elapsed = time.time() - began
        print(f"Step {step:4d}/{max_steps} | Loss: {avg_loss:.4f} | Time: {elapsed:.1f}s")

        if avg_loss < 0.2:
            print(f"\n✓ Target reached! Loss < 0.2 at step {step}")
            break

    elapsed = time.time() - began
    final_loss = np.mean(losses[-100:])
    initial_loss = np.mean(losses[:10])
    improvement = (1 - final_loss / initial_loss) * 100

    print()
    print(rule)
    print("TRAINING COMPLETE")
    print(rule)
    print(f"Time: {elapsed:.1f} seconds")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()

    return losses
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Testing
|
||||
# ============================================================================
|
||||
|
||||
def test_memorization(model, test_sequences, char_to_idx, idx_to_char):
|
||||
"""
|
||||
Test if model can reproduce memorized sequences.
|
||||
"""
|
||||
print("=" * 70)
|
||||
print("TESTING LEVEL 1: MEMORIZATION")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
correct = 0
|
||||
total = len(test_sequences)
|
||||
|
||||
for seq in test_sequences:
|
||||
# Encode
|
||||
tokens = encode_sequence(seq, char_to_idx, max_len=8)
|
||||
|
||||
# Get model predictions
|
||||
x = Tensor(np.array([tokens[:-1]], dtype=np.int32), requires_grad=False)
|
||||
logits = model.forward(x)
|
||||
|
||||
# Decode predictions (greedy)
|
||||
predicted_tokens = []
|
||||
for i in range(logits.shape[1]):
|
||||
next_token = int(np.argmax(logits.data[0, i, :]))
|
||||
predicted_tokens.append(next_token)
|
||||
|
||||
# Compare
|
||||
expected = tokens[1:] # Target sequence
|
||||
predicted = predicted_tokens
|
||||
|
||||
# Check if match (ignoring padding)
|
||||
match = True
|
||||
for exp, pred in zip(expected, predicted):
|
||||
if exp == 0: # Padding, stop checking
|
||||
break
|
||||
if exp != pred:
|
||||
match = False
|
||||
break
|
||||
|
||||
if match:
|
||||
correct += 1
|
||||
status = "✓"
|
||||
else:
|
||||
status = "✗"
|
||||
|
||||
# Decode for display
|
||||
expected_str = decode_sequence(expected, idx_to_char)
|
||||
predicted_str = decode_sequence(predicted, idx_to_char)
|
||||
|
||||
print(f"{status} Input: {seq[:4]:8s} → Expected: {expected_str:8s} | Got: {predicted_str:8s}")
|
||||
|
||||
accuracy = (correct / total) * 100
|
||||
print()
|
||||
print(f"Accuracy: {correct}/{total} ({accuracy:.1f}%)")
|
||||
print()
|
||||
|
||||
if accuracy >= 90:
|
||||
print("✓ LEVEL 1 PASSED: Transformer can memorize sequences!")
|
||||
else:
|
||||
print("✗ LEVEL 1 FAILED: Needs more training or debugging")
|
||||
|
||||
return accuracy
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """Run the Level 1 milestone: build data, train a tiny GPT, test, summarize."""
    rule = "=" * 70

    print()
    print(rule)
    print("MILESTONE 05 - LEVEL 1: TRANSFORMER MEMORIZATION TEST")
    print(rule)
    print()
    print("Goal: Train transformer to memorize simple sequences in < 2 minutes")
    print()

    # Dataset and character-level vocabulary
    sequences = create_memorization_dataset()
    char_to_idx, idx_to_char = create_simple_tokenizer(sequences)
    vocab_size = len(idx_to_char)

    print(f"Dataset: {len(sequences)} sequences")
    print(f"Vocabulary: {vocab_size} tokens")
    print(f"Example: {sequences[0]} → {encode_sequence(sequences[0], char_to_idx)}")
    print()

    train_data = [encode_sequence(text, char_to_idx, max_len=8) for text in sequences]

    # Deliberately tiny model so the run finishes quickly
    config = dict(
        vocab_size=vocab_size,
        embed_dim=16,
        num_layers=1,
        num_heads=2,
        max_seq_len=8,
    )

    print("Model configuration:")
    for name, value in config.items():
        print(f" {name}: {value}")
    print()

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    print()

    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    print("Starting training...")
    print()
    losses = train_memorization(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        vocab_size=vocab_size,
        max_steps=200,
    )

    print("Starting testing...")
    print()
    accuracy = test_memorization(model, sequences, char_to_idx, idx_to_char)

    print(rule)
    print("LEVEL 1 SUMMARY")
    print(rule)
    print(f"✓ Training: {len(losses)} steps")
    print(f"✓ Loss: {np.mean(losses[:10]):.4f} → {np.mean(losses[-100:]):.4f}")
    print(f"✓ Accuracy: {accuracy:.1f}%")
    print()

    passed = accuracy >= 90
    if passed:
        print("🎉 LEVEL 1 COMPLETE! Ready for Level 2: Pattern Completion")
    else:
        print("⚠️ LEVEL 1 INCOMPLETE: Needs debugging")
    print()


if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -1,357 +0,0 @@
|
||||
"""
|
||||
Milestone 05 - Level 2: Transformer Pattern Completion
|
||||
=======================================================
|
||||
|
||||
SIMPLE PATTERN COMPLETION TEST:
|
||||
Can the transformer learn to complete simple patterns?
|
||||
|
||||
Task: Given "A B C", predict "D"
|
||||
Given "1 2 3", predict "4"
|
||||
Given "do re mi", predict "fa"
|
||||
|
||||
Expected:
|
||||
- Train in < 5 minutes
|
||||
- Loss should drop from ~3.0 to < 0.5
|
||||
- Should complete 70%+ of patterns correctly
|
||||
|
||||
This validates:
|
||||
✓ Transformer can learn relationships
|
||||
✓ Attention mechanism captures patterns
|
||||
✓ Model generalizes beyond memorization
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.models.transformer import GPT
|
||||
|
||||
enable_autograd()
|
||||
|
||||
# ============================================================================
|
||||
# Level 2: Pattern Completion Dataset
|
||||
# ============================================================================
|
||||
|
||||
def create_pattern_dataset():
    """
    Return the fixed list of (prompt, completion) pairs for Level 2.

    The pairs cover alphabet runs, counting, short word groups, and
    single-symbol repetition, in that order.
    """
    alphabet = [("A B C", "D"), ("D E F", "G"), ("M N O", "P"), ("W X Y", "Z")]
    counting = [("1 2 3", "4"), ("5 6 7", "8")]
    words = [("cat dog", "rat"), ("up down", "left")]
    # "A A A" -> "A", etc.: three space-separated copies, completed by a fourth.
    repetition = [(" ".join(symbol * 3), symbol) for symbol in ("A", "B", "1")]
    return alphabet + counting + words + repetition
|
||||
|
||||
|
||||
def create_tokenizer(patterns):
    """Build character <-> id tables from (prompt, completion) pairs.

    Real characters are numbered from 2 in sorted order; id 0 is '<PAD>' and
    id 1 is '<EOS>'. Returns the pair (char_to_idx, idx_to_char).
    """
    # Prompt and completion are glued with a space, so the space character is
    # part of the vocabulary whenever at least one pattern is given.
    corpus = ' '.join(pair[0] + ' ' + pair[1] for pair in patterns)
    vocabulary = sorted(set(corpus))

    char_to_idx = {}
    idx_to_char = {}
    for token_id, symbol in enumerate(vocabulary, start=2):
        char_to_idx[symbol] = token_id
        idx_to_char[token_id] = symbol

    # Special tokens are registered after the real characters.
    for token_id, marker in ((0, '<PAD>'), (1, '<EOS>')):
        char_to_idx[marker] = token_id
        idx_to_char[token_id] = marker

    return char_to_idx, idx_to_char
|
||||
|
||||
|
||||
def encode_pattern(input_str, target_str, char_to_idx, max_len=16):
    """Encode a pair as prompt ids, EOS (1), completion ids, EOS (1).

    Unknown characters become 0. The result is cut or right-padded with 0 so
    that it has exactly max_len entries.
    """
    def with_eos(text):
        # Character ids followed by the end-of-sequence marker.
        return [char_to_idx.get(ch, 0) for ch in text] + [1]

    ids = (with_eos(input_str) + with_eos(target_str))[:max_len]
    return ids + [0] * (max_len - len(ids))
|
||||
|
||||
|
||||
def decode_tokens(tokens, idx_to_char):
    """Decode ids to text, stopping at the first padding (0) or EOS (1) id.

    Ids missing from idx_to_char are rendered as '?'.
    """
    pieces = []
    for token_id in tokens:
        if token_id in (0, 1):
            break
        pieces.append(idx_to_char.get(token_id, '?'))
    return ''.join(pieces)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Training
|
||||
# ============================================================================
|
||||
|
||||
def train_patterns(model, optimizer, loss_fn, train_data, vocab_size, max_steps=400):
    """
    Fit the model on randomly drawn encoded patterns using next-token prediction.

    Each step samples one encoded pattern, feeds all tokens but the last and
    scores the prediction against all tokens but the first. Gradients are
    clipped element-wise to [-1, 1] before the optimizer step. Progress is
    printed every 50 steps and on the final step; training stops early once the
    running average loss falls below 0.5.

    Returns the list of per-step loss values.
    """
    rule = "=" * 70
    print(rule)
    print("TRAINING LEVEL 2: PATTERN COMPLETION")
    print(rule)
    print(f"Dataset: {len(train_data)} patterns")
    print(f"Vocab size: {vocab_size}")
    print(f"Max steps: {max_steps}")
    print("Target: Loss < 0.5 in < 5 minutes")
    print()

    began = time.time()
    losses = []

    for step in range(max_steps):
        # One random training example per step (batch of 1).
        sample = train_data[np.random.randint(len(train_data))]
        x = Tensor(np.array([sample[:-1]], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([sample[1:]], dtype=np.int32), requires_grad=False)

        logits = model.forward(x)

        # Flatten (batch, position) so the loss sees one row per prediction.
        n_batch, n_pos, n_vocab = logits.shape
        n_rows = n_batch * n_pos
        loss = loss_fn.forward(logits.reshape(n_rows, n_vocab), y_true.reshape(n_rows))

        optimizer.zero_grad()
        loss.backward()

        # In-place element-wise gradient clipping.
        for param in model.parameters():
            if param.grad is None:
                continue
            np.clip(param.grad, -1.0, 1.0, out=param.grad)

        optimizer.step()
        losses.append(loss.data.item())

        is_report_step = step % 50 == 0 or step == max_steps - 1
        if not is_report_step:
            continue

        # Slicing already copes with fewer than 100 recorded losses.
        avg_loss = np.mean(losses[-100:])
        elapsed = time.time() - began
        print(f"Step {step:4d}/{max_steps} | Loss: {avg_loss:.4f} | Time: {elapsed:.1f}s")

        if avg_loss < 0.5:
            print(f"\n✓ Target reached! Loss < 0.5 at step {step}")
            break

    elapsed = time.time() - began
    final_loss = np.mean(losses[-100:])
    initial_loss = np.mean(losses[:10])
    improvement = (1 - final_loss / initial_loss) * 100

    print()
    print(rule)
    print("TRAINING COMPLETE")
    print(rule)
    print(f"Time: {elapsed:.1f} seconds")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()

    return losses
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Testing
|
||||
# ============================================================================
|
||||
|
||||
def test_patterns(model, test_patterns, char_to_idx, idx_to_char, max_len=16):
|
||||
"""
|
||||
Test if model can complete patterns.
|
||||
"""
|
||||
print("=" * 70)
|
||||
print("TESTING LEVEL 2: PATTERN COMPLETION")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
correct = 0
|
||||
total = len(test_patterns)
|
||||
|
||||
for input_str, expected_target in test_patterns:
|
||||
# Encode input + EOS
|
||||
input_tokens = [char_to_idx.get(c, 0) for c in input_str]
|
||||
input_tokens.append(1) # EOS
|
||||
|
||||
# Pad to max_len-1 (leave room for generation)
|
||||
while len(input_tokens) < max_len - 1:
|
||||
input_tokens.append(0)
|
||||
input_tokens = input_tokens[:max_len-1]
|
||||
|
||||
# Forward pass
|
||||
x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
|
||||
logits = model.forward(x)
|
||||
|
||||
# Get prediction for next token (after input + EOS)
|
||||
input_len = len([c for c in input_str]) + 1 # +1 for EOS
|
||||
if input_len < len(input_tokens):
|
||||
next_token_logits = logits.data[0, input_len - 1, :] # Predict position after EOS
|
||||
predicted_token = int(np.argmax(next_token_logits))
|
||||
|
||||
# Decode
|
||||
predicted_char = idx_to_char.get(predicted_token, '?')
|
||||
|
||||
# Check if correct (compare first character of target)
|
||||
expected_first_char = expected_target[0] if len(expected_target) > 0 else ''
|
||||
match = (predicted_char == expected_first_char)
|
||||
else:
|
||||
match = False
|
||||
predicted_char = '?'
|
||||
|
||||
if match:
|
||||
correct += 1
|
||||
status = "✓"
|
||||
else:
|
||||
status = "✗"
|
||||
|
||||
print(f"{status} Input: \"{input_str:12s}\" → Expected: \"{expected_target:6s}\" | Got: \"{predicted_char}\"")
|
||||
|
||||
accuracy = (correct / total) * 100
|
||||
print()
|
||||
print(f"Accuracy: {correct}/{total} ({accuracy:.1f}%)")
|
||||
print()
|
||||
|
||||
if accuracy >= 70:
|
||||
print("✓ LEVEL 2 PASSED: Transformer can complete patterns!")
|
||||
else:
|
||||
print("✗ LEVEL 2 FAILED: Needs more training")
|
||||
|
||||
return accuracy
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """Run the Level 2 milestone: build patterns, train a small GPT, test, summarize."""
    rule = "=" * 70

    print()
    print(rule)
    print("MILESTONE 05 - LEVEL 2: TRANSFORMER PATTERN COMPLETION")
    print(rule)
    print()
    print("Goal: Train transformer to complete patterns in < 5 minutes")
    print()

    # Dataset and character-level vocabulary
    patterns = create_pattern_dataset()
    char_to_idx, idx_to_char = create_tokenizer(patterns)
    vocab_size = len(idx_to_char)

    first_prompt, first_completion = patterns[0][0], patterns[0][1]
    print(f"Dataset: {len(patterns)} patterns")
    print(f"Vocabulary: {vocab_size} tokens")
    print(f'Example: "{first_prompt}" → "{first_completion}"')
    print()

    max_len = 16
    train_data = [
        encode_pattern(prompt, completion, char_to_idx, max_len)
        for prompt, completion in patterns
    ]

    # Small model, slightly larger than the Level 1 one
    config = dict(
        vocab_size=vocab_size,
        embed_dim=24,
        num_layers=2,
        num_heads=2,
        max_seq_len=max_len,
    )

    print("Model configuration:")
    for name, value in config.items():
        print(f" {name}: {value}")
    print()

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    print()

    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    print("Starting training...")
    print()
    losses = train_patterns(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        vocab_size=vocab_size,
        max_steps=400,
    )

    print("Starting testing...")
    print()
    accuracy = test_patterns(model, patterns, char_to_idx, idx_to_char, max_len)

    print(rule)
    print("LEVEL 2 SUMMARY")
    print(rule)
    print(f"✓ Training: {len(losses)} steps")
    print(f"✓ Loss: {np.mean(losses[:10]):.4f} → {np.mean(losses[-100:]):.4f}")
    print(f"✓ Accuracy: {accuracy:.1f}%")
    print()

    passed = accuracy >= 70
    if passed:
        print("🎉 LEVEL 2 COMPLETE! Ready for Level 3: Text Generation")
    else:
        print("⚠️ LEVEL 2 INCOMPLETE: Needs more training")
    print()


if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
"""
|
||||
Simple GPT model for CodeBot milestone - bypasses LayerNorm gradient bug.
|
||||
|
||||
This is a workaround for the milestone until core Tensor operations
|
||||
(subtraction, mean) are fixed to maintain gradient flow.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.layers import Linear
|
||||
from tinytorch.core.attention import MultiHeadAttention
|
||||
from tinytorch.core.activations import GELU
|
||||
from tinytorch.text.embeddings import Embedding
|
||||
|
||||
|
||||
class SimpleGPT:
    """
    GPT-style language model assembled without LayerNorm.

    Layout:
    - token embedding plus learned position embedding
    - `num_layers` blocks, each: attention with residual, then a two-layer
      GELU MLP with residual
    - linear projection from the embedding width to the vocabulary

    LayerNorm is omitted on purpose as a workaround; per the module notes, the
    full GPT needs fixes to core Tensor subtraction/mean before it can train.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        num_layers: int,
        num_heads: int,
        max_seq_len: int,
        mlp_ratio: int = 4
    ):
        """Store the hyper-parameters and create all layers.

        `mlp_ratio` sets the hidden width of each MLP as a multiple of
        `embed_dim`.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # Input embeddings
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.position_embedding = Embedding(max_seq_len, embed_dim)

        # Each block is a plain dict of layers; layers are created in the
        # order attention, fc1, GELU, fc2.
        hidden_dim = embed_dim * mlp_ratio
        self.blocks = [
            {
                'attention': MultiHeadAttention(embed_dim, num_heads),
                'mlp_fc1': Linear(embed_dim, hidden_dim),
                'mlp_gelu': GELU(),
                'mlp_fc2': Linear(hidden_dim, embed_dim),
            }
            for _ in range(num_layers)
        ]

        # Final projection to vocabulary logits
        self.lm_head = Linear(embed_dim, vocab_size)

    def forward(self, tokens: Tensor) -> Tensor:
        """
        Compute vocabulary logits for every position of the input.

        Args:
            tokens: Token indices, shape (batch_size, seq_len)

        Returns:
            logits: Predictions, shape (batch_size, seq_len, vocab_size)
        """
        _, seq_len = tokens.shape

        # Positions 0..seq_len-1 as a single row; the addition below relies on
        # it combining with the token embeddings across the batch.
        positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
        x = self.token_embedding.forward(tokens) + self.position_embedding.forward(positions)

        # NOTE(review): no mask is passed to attention here; whether
        # MultiHeadAttention applies causal masking on its own is not visible
        # from this file - confirm.
        for block in self.blocks:
            # Attention sub-layer with residual connection
            x = x + block['attention'].forward(x)

            # MLP sub-layer (fc1 -> GELU -> fc2) with residual connection
            hidden = block['mlp_fc1'].forward(x)
            hidden = block['mlp_gelu'].forward(hidden)
            x = x + block['mlp_fc2'].forward(hidden)

        return self.lm_head.forward(x)

    def parameters(self):
        """Collect the parameters of the embeddings, every block's attention
        and MLP linears, and the output head into one list."""
        collected = list(self.token_embedding.parameters())
        collected.extend(self.position_embedding.parameters())

        for block in self.blocks:
            for layer_name in ('attention', 'mlp_fc1', 'mlp_fc2'):
                collected.extend(block[layer_name].parameters())

        collected.extend(self.lm_head.parameters())
        return collected
|
||||
|
||||
@@ -1,316 +0,0 @@
|
||||
"""
|
||||
Milestone 05 - 5-Minute Training Test
|
||||
======================================
|
||||
|
||||
GOAL: Train the best possible transformer in exactly 5 minutes.
|
||||
|
||||
We'll optimize for:
|
||||
- Maximum learning in 5 minutes
|
||||
- Clear progress visualization
|
||||
- Actual generation testing
|
||||
- Student-friendly output
|
||||
|
||||
This will show what's realistically achievable in a classroom demo.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.models.transformer import GPT
|
||||
|
||||
enable_autograd()
|
||||
|
||||
# ============================================================================
|
||||
# Dataset: Mix of memorization + patterns
|
||||
# ============================================================================
|
||||
|
||||
def create_dataset():
    """Return the training strings: 21 short sequences repeated three times."""
    repeated_symbols = ["AAAA", "BBBB", "CCCC", "1111", "2222"]
    runs = ["ABCD", "EFGH", "IJKL", "MNOP", "QRST", "1234", "5678", "9012"]
    pairs = ["AB", "CD", "EF", "GH", "12", "34", "56", "78"]

    base = repeated_symbols + runs + pairs
    # Every example appears three times so it is sampled more often.
    return base * 3
|
||||
|
||||
|
||||
def create_tokenizer(sequences):
    """Build character <-> id tables; ids start at 1 and 0 is '<PAD>'."""
    alphabet = sorted(set(''.join(sequences)))

    char_to_idx = dict(zip(alphabet, range(1, len(alphabet) + 1)))
    idx_to_char = {token_id: symbol for symbol, token_id in char_to_idx.items()}

    # Padding entry is added after the real characters.
    char_to_idx['<PAD>'] = 0
    idx_to_char[0] = '<PAD>'
    return char_to_idx, idx_to_char
|
||||
|
||||
|
||||
def encode(seq, char_to_idx, max_len=10):
    """Convert a string to exactly max_len token ids.

    Unknown characters map to 0; short inputs are right-padded with 0 and
    long inputs are truncated.
    """
    ids = [char_to_idx.get(ch, 0) for ch in seq]
    shortfall = max_len - len(ids)
    if shortfall > 0:
        return ids + [0] * shortfall
    return ids[:max_len]
|
||||
|
||||
|
||||
def decode(tokens, idx_to_char):
    """Turn token ids into text, dropping id 0 and ids with no mapping."""
    non_padding = (token_id for token_id in tokens if token_id != 0)
    return ''.join(idx_to_char.get(token_id, '') for token_id in non_padding)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Training with 5-minute time limit
|
||||
# ============================================================================
|
||||
|
||||
def train_5_minutes(model, optimizer, loss_fn, train_data, max_time_seconds=300):
    """
    Train with next-token prediction until a wall-clock budget is used up.

    Each step samples one encoded sequence at random, feeds all but its last
    token to the model and scores the output against the same sequence shifted
    by one position. Gradients are clipped element-wise to [-1, 1] before the
    optimizer update. Progress is printed every 50 steps until the first
    one-minute checkpoint, then once per elapsed minute.

    Args:
        model: Object exposing ``forward(x)`` and ``parameters()``; the value
            returned by ``forward`` must unpack into a 3-element ``shape``.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        loss_fn: Object exposing ``forward(logits, targets)``.
        train_data: Indexable collection of token-id lists.
        max_time_seconds: Wall-clock budget in seconds (default 300).

    Returns:
        Tuple ``(losses, step)``: the per-step loss values and the number of
        steps completed. Both are empty/zero when the budget is not positive.
    """
    print("=" * 70)
    print("TRAINING FOR 5 MINUTES")
    print("=" * 70)
    print(f"Dataset: {len(train_data)} sequences")
    print(f"Time limit: {max_time_seconds}s ({max_time_seconds/60:.1f} minutes)")
    print()

    start_time = time.time()
    losses = []
    step = 0

    # One checkpoint per full minute of the budget (60, 120, ... seconds).
    # For the default 300 s this is [60, 120, 180, 240, 300]; shorter or
    # longer budgets get a matching schedule instead of a fixed one.
    checkpoints = [60 * minute for minute in range(1, int(max_time_seconds // 60) + 1)]
    checkpoint_idx = 0

    print("Training started...")
    print()

    while time.time() - start_time < max_time_seconds:
        # Sample random sequence
        tokens = train_data[np.random.randint(len(train_data))]

        # Next token prediction: inputs are tokens[:-1], targets tokens[1:]
        input_seq = tokens[:-1]
        target_seq = tokens[1:]

        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)

        # Forward
        logits = model.forward(x)

        # Loss over the flattened (batch * seq_len) positions
        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = y_true.reshape(batch_size * seq_len)
        loss = loss_fn.forward(logits_flat, targets_flat)

        # Backward
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients in place
        for param in model.parameters():
            if param.grad is not None:
                np.clip(param.grad, -1.0, 1.0, out=param.grad)

        # Update
        optimizer.step()

        losses.append(loss.data.item())
        step += 1

        # Measure after the step so the reported time and speed include the
        # work just done, and so the final checkpoint can still be reported
        # when the last step crosses the budget.
        elapsed = time.time() - start_time

        if checkpoint_idx < len(checkpoints) and elapsed >= checkpoints[checkpoint_idx]:
            # Slicing already copes with fewer than 50 recorded losses.
            avg_loss = np.mean(losses[-50:])
            steps_per_sec = step / elapsed
            print(f"[{int(elapsed):3d}s] Step {step:4d} | Loss: {avg_loss:.4f} | Speed: {steps_per_sec:.2f} steps/sec")
            checkpoint_idx += 1
        elif step % 50 == 0 and checkpoint_idx == 0:
            # Before the first checkpoint, also report every 50 steps.
            avg_loss = np.mean(losses[-50:])
            print(f"[{int(elapsed):3d}s] Step {step:4d} | Loss: {avg_loss:.4f}")

    final_elapsed = time.time() - start_time

    if losses:
        final_loss = np.mean(losses[-100:])
        initial_loss = np.mean(losses[:10])
    else:
        # No step ran (e.g. zero budget): report NaN instead of averaging an
        # empty list.
        final_loss = initial_loss = float("nan")

    if losses and initial_loss != 0:
        improvement = (1 - final_loss / initial_loss) * 100
    else:
        improvement = float("nan")

    # A zero elapsed time is possible on coarse clocks when no step ran.
    steps_per_second = step / final_elapsed if final_elapsed > 0 else 0.0

    print()
    print("=" * 70)
    print("TRAINING COMPLETE")
    print("=" * 70)
    print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.2f} minutes)")
    print(f"Total steps: {step}")
    print(f"Steps/second: {steps_per_second:.2f}")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()

    return losses, step
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Testing
|
||||
# ============================================================================
|
||||
|
||||
def test_generation(model, test_sequences, char_to_idx, idx_to_char, max_tests=15):
    """
    Check how many sequences the model reproduces token-for-token.

    Each of the first ``max_tests`` sequences is encoded to length 10, all but
    its last token are fed to the model, and the arg-max prediction at every
    position is compared with the sequence shifted by one. Positions whose
    expected token is padding (0) are ignored in the comparison.

    Args:
        model: Object exposing ``forward(x)``; its result must provide
            ``shape`` and an indexable ``data`` attribute.
        test_sequences: List of strings to evaluate.
        char_to_idx: Character-to-id mapping passed to ``encode``.
        idx_to_char: Id-to-character mapping passed to ``decode``.
        max_tests: Maximum number of sequences to evaluate (default 15).

    Returns:
        Percentage (0-100) of evaluated sequences predicted correctly;
        0.0 when nothing was evaluated.
    """
    print("=" * 70)
    print("TESTING GENERATION")
    print("=" * 70)
    print()

    samples = test_sequences[:max_tests]
    correct = 0

    for seq in samples:
        tokens = encode(seq, char_to_idx, max_len=10)

        # Get predictions
        x = Tensor(np.array([tokens[:-1]], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        # Greedy prediction at each position of the single batch row
        predicted_tokens = [
            int(np.argmax(logits.data[0, i, :])) for i in range(logits.shape[1])
        ]

        # Compare, skipping padded target positions
        expected = tokens[1:]
        match = all(e == p for e, p in zip(expected, predicted_tokens) if e != 0)

        if match:
            correct += 1
            status = "✓"
        else:
            status = "✗"

        expected_str = decode(expected, idx_to_char)
        predicted_str = decode(predicted_tokens, idx_to_char)

        print(f"{status} Input: {seq[:6]:8s} → Expected: {expected_str:8s} | Got: {predicted_str:8s}")

    # Divide by the number of sequences actually evaluated, which can be
    # smaller than max_tests when the list is short.
    tested = len(samples)
    accuracy = (correct / tested) * 100 if tested else 0.0
    print()
    print(f"Accuracy: {correct}/{tested} ({accuracy:.1f}%)")
    print()

    return accuracy
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """Run the 5-minute milestone: build data and model, train, then evaluate.

    Prints the dataset and model configuration, trains a one-layer GPT for
    300 seconds via ``train_5_minutes``, measures reproduction accuracy with
    ``test_generation`` and prints a summary with a verdict line.
    """
    rule = "=" * 70

    print()
    print(rule)
    print("MILESTONE 05 - 5-MINUTE TRAINING TEST")
    print(rule)
    print()
    print("Let's find out what we can learn in exactly 5 minutes!")
    print()

    # Dataset and vocabulary
    sequences = create_dataset()
    char_to_idx, idx_to_char = create_tokenizer(sequences)
    vocab_size = len(idx_to_char)

    print(f"Dataset: {len(sequences)} sequences (with repetition)")
    print(f"Unique sequences: {len(set(sequences))}")
    print(f"Vocabulary: {vocab_size} tokens")
    print()

    # Every sequence is encoded to a fixed length of 10 tokens.
    train_data = [encode(seq, char_to_idx, max_len=10) for seq in sequences]

    # Deliberately tiny model so that many steps fit in the time budget
    # (target: under one second per step).
    config = dict(
        vocab_size=vocab_size,
        embed_dim=16,
        num_layers=1,
        num_heads=2,
        max_seq_len=10,
    )

    print("Model configuration:")
    for key, val in config.items():
        print(f" {key}: {val}")
    print()

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    print()

    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    print("Starting 5-minute training run...")
    print("(Progress will be shown every minute)")
    print()

    losses, total_steps = train_5_minutes(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        max_time_seconds=300,
    )

    print("Testing what the model learned...")
    print()
    accuracy = test_generation(model, sequences, char_to_idx, idx_to_char)

    # Same windows as the training summary: first 10 and last 100 steps.
    start_loss = np.mean(losses[:10])
    end_loss = np.mean(losses[-100:])

    print(rule)
    print("5-MINUTE TRAINING SUMMARY")
    print(rule)
    print(f"✓ Model: {num_params:,} parameters")
    print(f"✓ Steps completed: {total_steps}")
    print(f"✓ Loss: {start_loss:.4f} → {end_loss:.4f}")
    print(f"✓ Improvement: {(1 - end_loss/start_loss)*100:.1f}%")
    print(f"✓ Accuracy: {accuracy:.1f}%")
    print()

    # Highest threshold first; the first one reached decides the verdict.
    verdicts = (
        (60, "🎉 EXCELLENT! Model learned well in 5 minutes!"),
        (40, "✓ GOOD! Model is learning, could use more training."),
        (20, "⚠️ FAIR: Model is learning but needs optimization."),
    )
    message = "⚠️ Model needs more training time or tuning."
    for threshold, text in verdicts:
        if accuracy >= threshold:
            message = text
            break
    print(message)
    print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,744 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Progressive Test Suite for TinyGPT Learning
|
||||
|
||||
Tests transformer learning from absolute simplest to complex:
|
||||
0. Memorize single sequence (MUST work)
|
||||
1. Pattern completion (A B A → B)
|
||||
2. Copy task (COPY: X → X)
|
||||
3. Simple arithmetic (2+3 → 5)
|
||||
4. TinyTalks greetings
|
||||
|
||||
This helps identify EXACTLY where learning breaks down.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(project_root)
|
||||
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich import box
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
def run_test_0_memorize_sequence():
    """
    TEST 0: Memorize Single Sequence

    Runs 10 verbose training steps of a one-block transformer on the text
    "HELLO WORLD " repeated 100 times. Each step prints the tensor shape after
    every stage, the loss, whether the loss carries a ``_grad_fn``, and how
    many parameters received a non-zero gradient.

    A failure here points at the forward pass, loss computation, backward pass
    or parameter update.

    Returns:
        True when the last-step loss is below 80% of the first-step loss,
        otherwise False.
    """
    rule = "=" * 70
    console.print("\n" + rule)
    console.print("[bold cyan]TEST 0: Single Sequence Memorization[/bold cyan]")
    console.print(rule)
    console.print("Task: Memorize 'HELLO WORLD' (repeated 100 times)")
    console.print("Expected: Loss should drop to near 0")
    console.print("Why: If this fails, autograd/optimizer is broken\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.core.autograd import enable_autograd
    from tinytorch.text.tokenization import CharTokenizer
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    enable_autograd()

    # Corpus: a single phrase repeated over and over.
    text = "HELLO WORLD " * 100

    tokenizer = CharTokenizer()
    tokenizer.build_vocab([text])
    data = tokenizer.encode(text)

    console.print(f"Data length: {len(data)} tokens")
    console.print(f"Vocabulary: {tokenizer.vocab_size} chars")
    console.print(f"Unique text: '{text[:50]}...'\n")

    # Model dimensions
    vocab_size = tokenizer.vocab_size
    embed_dim = 32
    seq_len = 16
    num_steps = 10

    # Minimal model: embedding -> positions -> one block -> norm -> projection
    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(seq_len, embed_dim)
    transformer = TransformerBlock(embed_dim, num_heads=2, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    params = []
    for module in (embedding, pos_enc, transformer, ln, output_proj):
        params.extend(module.parameters())
    for tensor in params:
        tensor.requires_grad = True

    console.print(f"Model: {len(params)} parameter tensors")
    console.print(f"Embed dim: {embed_dim}, Seq len: {seq_len}\n")

    optimizer = Adam(params, lr=0.01)
    criterion = CrossEntropyLoss()

    console.print("[yellow]Training (10 steps)...[/yellow]")
    console.print("[dim]Watching for: loss decrease, gradient flow, parameter updates[/dim]\n")

    # Stages applied in order; the label is what gets printed with the shape.
    stages = (
        ("embed_out", embedding),
        ("pos_out", pos_enc),
        ("trans_out", transformer),
        ("ln_out", ln),
    )

    initial_loss = None
    final_loss = None

    for step in range(num_steps):
        # Random window plus the same window shifted by one as target
        offset = np.random.randint(0, len(data) - seq_len - 1)
        input_seq = data[offset:offset + seq_len]
        target_seq = data[offset + 1:offset + seq_len + 1]

        console.print(f"[dim]Step {step+1}:[/dim]", end=" ")

        x = Tensor(np.array([input_seq]))
        y = Tensor(np.array([target_seq]))

        console.print(f"input shape={x.shape}", end=" ")

        for label, layer in stages:
            x = layer(x)
            console.print(f"{label}={x.shape}", end=" ")

        # Project each position to vocabulary logits
        batch, seq, dim = x.shape
        x_2d = x.reshape(batch * seq, dim)
        logits_2d = output_proj(x_2d)
        logits = logits_2d.reshape(batch, seq, vocab_size)

        console.print(f"logits={logits.shape}", end=" ")

        logits_flat = logits.reshape(batch * seq, vocab_size)
        targets_flat = y.reshape(-1)

        console.print(f"logits_flat={logits_flat.shape} targets_flat={targets_flat.shape}", end=" ")

        loss = criterion(logits_flat, targets_flat)

        loss_val = float(loss.data)
        console.print(f"loss={loss_val:.4f}", end=" ")

        # Report whether the loss is attached to a backward function
        has_grad_fn = getattr(loss, '_grad_fn', None) is not None
        console.print(f"has_grad_fn={has_grad_fn}", end=" ")

        optimizer.zero_grad()

        console.print("backward...", end=" ")
        loss.backward()

        # Count parameters whose gradient exists and is not all zeros
        params_with_grad = 0
        for tensor in params:
            if tensor.grad is not None and np.any(tensor.grad != 0):
                params_with_grad += 1
        console.print(f"params_w_grad={params_with_grad}/{len(params)}", end=" ")

        optimizer.step()
        console.print("updated")

        if step == 0:
            initial_loss = loss_val
            console.print(f" [yellow]→ Initial loss: {initial_loss:.4f}[/yellow]")
        if step == num_steps - 1:
            final_loss = loss_val

        if step > 0 and step % 2 == 0:
            console.print(f" [cyan]→ Loss so far: {loss_val:.4f}[/cyan]")

    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial loss: {initial_loss:.4f}")
    console.print(f" Final loss: {final_loss:.4f}")
    console.print(f" Decrease: {initial_loss - final_loss:.4f}")

    passed = final_loss < initial_loss * 0.8
    if not passed:
        console.print(" [red]✗ FAIL: Loss didn't decrease enough[/red]")
        console.print(" [red]→ Bug in: autograd, optimizer, or forward pass[/red]")
        return False

    console.print(" [green]✓ PASS: Loss decreased significantly[/green]")
    return True
|
||||
|
||||
|
||||
def run_test_1_pattern_completion():
    """
    TEST 1: Pattern Completion

    Trains a one-block transformer for 30 steps on a character stream built
    from three alternating two-symbol patterns ("A B ...", "1 2 ...",
    "X Y ...") and reports whether the next-character loss fell.

    Returns:
        True when the first-step loss exceeds the last-step loss by more than
        0.5, otherwise False.
    """
    rule = "=" * 70
    console.print("\n" + rule)
    console.print("[bold cyan]TEST 1: Pattern Completion[/bold cyan]")
    console.print(rule)
    console.print("Task: Learn repeating patterns (ABAB... → A, 1212... → 1)")
    console.print("Expected: Predict next token correctly after training")
    console.print("Why: Tests if attention can learn simple sequences\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    # Three pattern lines, concatenated and repeated 50 times
    patterns = [
        "A B A B A B A B A B ",
        "1 2 1 2 1 2 1 2 1 2 ",
        "X Y X Y X Y X Y X Y ",
    ]
    text = "".join(patterns) * 50

    console.print(f"Data: {len(text)} chars")
    console.print("Patterns: ABAB, 1212, XYXY")
    console.print(f"Sample: '{text[:40]}...'\n")

    # Character-level vocabulary, ids assigned in sorted order
    chars = sorted(set(text))
    vocab_size = len(chars)
    idx_to_char = dict(enumerate(chars))
    char_to_idx = {ch: i for i, ch in idx_to_char.items()}
    data = np.array([char_to_idx[ch] for ch in text])

    console.print(f"Vocab: {vocab_size} chars: {repr(''.join(chars))}\n")

    # Tiny model
    embed_dim = 32
    num_heads = 2
    seq_len = 8
    num_steps = 30

    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    # Embedding weight first, then each module's parameter list appended
    params = [embedding.weight]
    for module in (pos_enc, transformer, ln, output_proj):
        params = params + module.parameters()
    for tensor in params:
        tensor.requires_grad = True

    optimizer = Adam(params, lr=0.01)
    criterion = CrossEntropyLoss()

    console.print("[yellow]Training (30 steps on patterns)...[/yellow]")

    initial_loss = None
    final_loss = None

    for step in range(num_steps):
        is_last = step == num_steps - 1

        # Random window and the same window shifted by one as target
        offset = np.random.randint(0, len(data) - seq_len - 1)
        input_seq = data[offset:offset + seq_len]
        target_seq = data[offset + 1:offset + seq_len + 1]

        x = Tensor(np.array([input_seq]))
        y = Tensor(np.array([target_seq]))

        for layer in (embedding, pos_enc, transformer, ln):
            x = layer(x)

        batch, seq, dim = x.shape
        x_2d = x.reshape(batch * seq, dim)
        logits_2d = output_proj(x_2d)
        logits = logits_2d.reshape(batch, seq, vocab_size)

        logits_flat = logits.reshape(batch * seq, vocab_size)
        targets_flat = y.reshape(-1)
        loss = criterion(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = float(loss.data)
        if step == 0:
            initial_loss = loss_val
        if is_last:
            final_loss = loss_val

        if is_last or step % 10 == 0:
            console.print(f" Step {step+1}: Loss = {loss_val:.4f}")

    decrease = initial_loss - final_loss
    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial: {initial_loss:.4f}")
    console.print(f" Final: {final_loss:.4f}")
    console.print(f" Decrease: {decrease:.4f}")

    passed = decrease > 0.5
    if not passed:
        console.print(" [red]✗ FAIL: Loss didn't decrease enough[/red]")
        return False

    console.print(" [green]✓ PASS: Loss decreased significantly[/green]")
    return True
|
||||
|
||||
|
||||
def run_test_2_copy_task():
    """
    TEST 2: Copy Task

    Trains a one-block transformer for 40 steps on a character stream of
    "COPY:<word>=<word> " examples (six words, repeated 50 times) and reports
    whether the next-character loss fell.

    Returns:
        True when the first-step loss exceeds the last-step loss by more than
        0.5, otherwise False.
    """
    rule = "=" * 70
    console.print("\n" + rule)
    console.print("[bold cyan]TEST 2: Copy Task[/bold cyan]")
    console.print(rule)
    console.print("Task: COPY: X → X (reproduce input)")
    console.print("Expected: Model learns to copy the input text")
    console.print("Why: Classic test of attention mechanism\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    # One "COPY:word=word " example per word, repeated 50 times
    words = ["hello", "world", "test", "copy", "learn", "task"]
    examples = [f"COPY:{word}={word} " for word in words]
    text = "".join(examples) * 50

    console.print(f"Data: {len(text)} chars")
    console.print("Examples: COPY:hello=hello, COPY:world=world")
    console.print(f"Sample: '{text[:50]}...'\n")

    # Character-level vocabulary, ids assigned in sorted order
    chars = sorted(set(text))
    vocab_size = len(chars)
    char_to_idx = {}
    for i, ch in enumerate(chars):
        char_to_idx[ch] = i
    data = np.array([char_to_idx[ch] for ch in text])

    console.print(f"Vocab: {vocab_size} chars\n")

    # Model dimensions
    embed_dim = 32
    num_heads = 2
    seq_len = 16
    num_steps = 40

    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    # Embedding weight first, then each module's parameter list appended
    params = [embedding.weight]
    for module in (pos_enc, transformer, ln, output_proj):
        params = params + module.parameters()
    for tensor in params:
        tensor.requires_grad = True

    optimizer = Adam(params, lr=0.01)
    criterion = CrossEntropyLoss()

    console.print("[yellow]Training (40 steps on copy task)...[/yellow]")

    initial_loss = None
    final_loss = None

    for step in range(num_steps):
        is_last = step == num_steps - 1

        # Random window and the same window shifted by one as target
        offset = np.random.randint(0, len(data) - seq_len - 1)
        input_seq = data[offset:offset + seq_len]
        target_seq = data[offset + 1:offset + seq_len + 1]

        x = Tensor(np.array([input_seq]))
        y = Tensor(np.array([target_seq]))

        for layer in (embedding, pos_enc, transformer, ln):
            x = layer(x)

        batch, seq, dim = x.shape
        x_2d = x.reshape(batch * seq, dim)
        logits_2d = output_proj(x_2d)
        logits = logits_2d.reshape(batch, seq, vocab_size)

        logits_flat = logits.reshape(batch * seq, vocab_size)
        targets_flat = y.reshape(-1)
        loss = criterion(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = float(loss.data)
        if step == 0:
            initial_loss = loss_val
        if is_last:
            final_loss = loss_val

        if is_last or step % 10 == 0:
            console.print(f" Step {step+1}: Loss = {loss_val:.4f}")

    decrease = initial_loss - final_loss
    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial: {initial_loss:.4f}")
    console.print(f" Final: {final_loss:.4f}")
    console.print(f" Decrease: {decrease:.4f}")

    passed = decrease > 0.5
    if not passed:
        console.print(" [red]✗ FAIL: Loss didn't decrease enough[/red]")
        return False

    console.print(" [green]✓ PASS: Loss decreased[/green]")
    return True
|
||||
|
||||
|
||||
def run_test_3_simple_arithmetic():
    """
    TEST 3: Simple Arithmetic

    Trains a one-block transformer for 50 steps on a character stream of
    single-digit additions "a+b=c " for a, b in 1..5 (repeated 30 times) and
    reports whether the next-character loss fell.

    Returns:
        True when the first-step loss exceeds the last-step loss by more than
        0.3, otherwise False.
    """
    rule = "=" * 70
    console.print("\n" + rule)
    console.print("[bold cyan]TEST 3: Simple Arithmetic[/bold cyan]")
    console.print(rule)
    console.print("Task: 2+3=5, 1+1=2, etc. (single digit)")
    console.print("Expected: Correct answers after training")
    console.print("Why: Tests reasoning ability\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    # All 25 sums with both operands in 1..5, repeated 30 times
    operands = range(1, 6)
    examples = [f"{a}+{b}={a+b} " for a in operands for b in operands]
    text = "".join(examples) * 30

    console.print(f"Data: {len(text)} chars")
    console.print("Examples: 1+1=2, 2+3=5, 4+5=9")
    console.print(f"Sample: '{text[:40]}...'\n")

    # Character-level vocabulary, ids assigned in sorted order
    chars = sorted(set(text))
    vocab_size = len(chars)
    char_to_idx = {}
    for i, ch in enumerate(chars):
        char_to_idx[ch] = i
    data = np.array([char_to_idx[ch] for ch in text])

    console.print(f"Vocab: {vocab_size} chars\n")

    # Model dimensions
    embed_dim = 48
    num_heads = 3
    seq_len = 12
    num_steps = 50

    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    # Embedding weight first, then each module's parameter list appended
    params = [embedding.weight]
    for module in (pos_enc, transformer, ln, output_proj):
        params = params + module.parameters()
    for tensor in params:
        tensor.requires_grad = True

    optimizer = Adam(params, lr=0.01)
    criterion = CrossEntropyLoss()

    console.print("[yellow]Training (50 steps on arithmetic)...[/yellow]")

    initial_loss = None
    final_loss = None

    for step in range(num_steps):
        is_last = step == num_steps - 1

        # Random window and the same window shifted by one as target
        offset = np.random.randint(0, len(data) - seq_len - 1)
        input_seq = data[offset:offset + seq_len]
        target_seq = data[offset + 1:offset + seq_len + 1]

        x = Tensor(np.array([input_seq]))
        y = Tensor(np.array([target_seq]))

        for layer in (embedding, pos_enc, transformer, ln):
            x = layer(x)

        batch, seq, dim = x.shape
        x_2d = x.reshape(batch * seq, dim)
        logits_2d = output_proj(x_2d)
        logits = logits_2d.reshape(batch, seq, vocab_size)

        logits_flat = logits.reshape(batch * seq, vocab_size)
        targets_flat = y.reshape(-1)
        loss = criterion(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = float(loss.data)
        if step == 0:
            initial_loss = loss_val
        if is_last:
            final_loss = loss_val

        if is_last or step % 10 == 0:
            console.print(f" Step {step+1}: Loss = {loss_val:.4f}")

    decrease = initial_loss - final_loss
    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial: {initial_loss:.4f}")
    console.print(f" Final: {final_loss:.4f}")
    console.print(f" Decrease: {decrease:.4f}")

    passed = decrease > 0.3
    if not passed:
        console.print(" [red]✗ FAIL: Loss didn't decrease enough[/red]")
        return False

    console.print(" [green]✓ PASS: Loss decreased[/green]")
    console.print(" [dim](arithmetic is harder, so lower threshold)[/dim]")
    return True
|
||||
|
||||
|
||||
def run_test_4_tinytalks_level1():
    """
    TEST 4: TinyTalks Level 1

    Q: Hello!
    A: Hi there!

    Loads "datasets/tinytalks/splits/train.txt" (relative to the current
    working directory), keeps up to ten short question/answer pairs, and
    trains a one-block transformer on them for 100 steps.

    Returns:
        True when the first-step loss exceeds the last-step loss by more than
        0.3, False when it does not, and None when the test is skipped (the
        file is missing or the selected text is too short to sample from).
    """
    console.print("\n" + "=" * 70)
    console.print("[bold cyan]TEST 4: TinyTalks Level 1[/bold cyan]")
    console.print("=" * 70)
    console.print("Task: Learn greeting Q&A pairs from TinyTalks")
    console.print("Expected: Can respond to greetings")
    console.print("Why: The actual milestone goal\n")

    from tinytorch.core.tensor import Tensor
    from tinytorch.core.optimizers import Adam
    from tinytorch.core.losses import CrossEntropyLoss
    from tinytorch.text.embeddings import Embedding, PositionalEncoding
    from tinytorch.models.transformer import TransformerBlock, LayerNorm
    from tinytorch.core.layers import Linear

    # Only opening the file can raise FileNotFoundError, so the try block is
    # limited to the read. An explicit encoding avoids depending on the
    # platform default.
    try:
        with open("datasets/tinytalks/splits/train.txt", "r", encoding="utf-8") as f:
            full_text = f.read()
    except FileNotFoundError:
        console.print("[red]TinyTalks not found, skipping Test 4[/red]")
        return None

    # Heuristic: Level 1 = very short Q&A (< 40 chars each). The file is read
    # in groups of three lines: question, answer, blank.
    lines = full_text.split('\n')
    level_1_text = []
    for i in range(0, len(lines) - 1, 3):
        q_line = lines[i]
        a_line = lines[i + 1]
        is_pair = q_line.startswith('Q:') and a_line.startswith('A:')
        if is_pair and len(q_line) < 40 and len(a_line) < 40:
            level_1_text.append(q_line + '\n' + a_line + '\n\n')

    if not level_1_text:
        # The fallback is the first 500 characters of the file.
        console.print("[red]No Level 1 data found, using first 500 chars[/red]")
        level_1_text = [full_text[:500]]

    text = "".join(level_1_text[:10])  # First 10 simple Q&A

    console.print(f"Data: {len(text)} chars (Level 1 greetings)")
    console.print(f"Sample:\n{text[:100]}...\n")

    # Tokenize: character-level vocabulary, ids assigned in sorted order
    chars = sorted(set(text))
    vocab_size = len(chars)
    char_to_idx = {ch: i for i, ch in enumerate(chars)}
    data = np.array([char_to_idx[ch] for ch in text])

    console.print(f"Vocab: {vocab_size} chars\n")

    # Build model (slightly larger for Q&A)
    embed_dim = 64
    num_heads = 4
    seq_len = 32
    num_steps = 100

    # np.random.randint(0, high) needs high >= 1, so the data must hold at
    # least seq_len + 2 tokens. Checked once, before any model is built.
    max_start = len(data) - seq_len - 1
    if max_start < 1:
        console.print("[red]Dataset too small[/red]")
        return None

    embedding = Embedding(vocab_size, embed_dim)
    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
    ln = LayerNorm(embed_dim)
    output_proj = Linear(embed_dim, vocab_size)

    params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters()
    for p in params:
        p.requires_grad = True

    optimizer = Adam(params, lr=0.005)  # Lower LR for Q&A
    criterion = CrossEntropyLoss()

    console.print("[yellow]Training (100 steps on TinyTalks Level 1)...[/yellow]")

    initial_loss = None
    final_loss = None

    for step in range(num_steps):
        # Random window and the same window shifted by one as target
        start = np.random.randint(0, max_start)
        input_seq = data[start:start + seq_len]
        target_seq = data[start + 1:start + seq_len + 1]

        x = Tensor(np.array([input_seq]))
        y = Tensor(np.array([target_seq]))

        x = embedding(x)
        x = pos_enc(x)
        x = transformer(x)
        x = ln(x)

        batch, seq, dim = x.shape
        x_2d = x.reshape(batch * seq, dim)
        logits_2d = output_proj(x_2d)
        logits = logits_2d.reshape(batch, seq, vocab_size)

        logits_flat = logits.reshape(batch * seq, vocab_size)
        targets_flat = y.reshape(-1)
        loss = criterion(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = float(loss.data)
        if step == 0:
            initial_loss = loss_val
        if step == num_steps - 1:
            final_loss = loss_val

        if step % 20 == 0 or step == num_steps - 1:
            console.print(f" Step {step+1}: Loss = {loss_val:.4f}")

    decrease = initial_loss - final_loss
    console.print("\n[bold]Results:[/bold]")
    console.print(f" Initial: {initial_loss:.4f}")
    console.print(f" Final: {final_loss:.4f}")
    console.print(f" Decrease: {decrease:.4f}")

    if decrease > 0.3:
        console.print(" [green]✓ PASS: Model is learning TinyTalks![/green]")
        console.print(" [cyan]→ Now train full model with tinytalks_gpt.py[/cyan]")
        return True

    console.print(" [yellow]⚠ PARTIAL: Some learning, may need more steps[/yellow]")
    return False
|
||||
|
||||
|
||||
def main():
    """Run the diagnostic tests in sequence and print a pass/fail summary.

    Test 0 (single-sequence memorization) gates the rest: tests 1-4 only
    run when test 0 returned a truthy result. Every test is executed
    behind the same crash guard, so an exception in any one of them is
    reported and recorded as a failure instead of aborting the suite
    before the summary is printed.
    """
    console.print("\n")
    console.print(Panel(
        "[bold cyan]TinyGPT Learning Diagnostic Suite[/bold cyan]\n\n"
        "Progressive tests from simplest to complex:\n"
        " 0. Single sequence memorization (MUST work)\n"
        " 1. Pattern completion (A B A → B)\n"
        " 2. Copy task (COPY: X → X)\n"
        " 3. Simple arithmetic (2+3 → 5)\n"
        " 4. TinyTalks greetings (Q&A)\n\n"
        "[yellow]This identifies EXACTLY where learning breaks down[/yellow]",
        title="🔬 Diagnostic Tests",
        border_style="cyan",
        box=box.DOUBLE
    ))

    results = {}

    def run_guarded(test_num, test_fn):
        """Run one test, recording False (and printing why) if it raises."""
        try:
            results[test_num] = test_fn()
        except Exception as e:  # top-level boundary: report and keep going
            console.print(f"\n[red]Test {test_num} crashed: {str(e)}[/red]")
            results[test_num] = False

    # Run tests
    run_guarded(0, run_test_0_memorize_sequence)

    # Only run the harder tests if the memorization baseline passed
    if results.get(0):
        follow_ups = (
            (1, run_test_1_pattern_completion),
            (2, run_test_2_copy_task),
            (3, run_test_3_simple_arithmetic),
            (4, run_test_4_tinytalks_level1),
        )
        for test_num, test_fn in follow_ups:
            run_guarded(test_num, test_fn)

    # Summary
    console.print("\n" + "=" * 70)
    console.print("[bold]Test Summary:[/bold]")
    console.print("=" * 70)

    for test_num, result in results.items():
        if result is True:
            console.print(f" Test {test_num}: [green]✓ PASS[/green]")
        elif result is False:
            console.print(f" Test {test_num}: [red]✗ FAIL[/red]")
        else:
            # A test returned something other than True/False (e.g. None)
            console.print(f" Test {test_num}: [yellow]○ TODO[/yellow]")

    console.print("\n" + "=" * 70)

    if results.get(0) is False:
        console.print("[bold red]CRITICAL: Test 0 failed![/bold red]")
        console.print("The transformer cannot even memorize a single sequence.")
        console.print("This indicates a fundamental bug in:")
        console.print(" - Forward pass computation")
        console.print(" - Autograd backward pass")
        console.print(" - Optimizer parameter updates")
        console.print(" - Loss computation")


if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -1,70 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick diagnostic to test if the model can learn ANY pattern at all.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(project_root)
|
||||
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.text.tokenization import CharTokenizer
|
||||
|
||||
# Enable autograd
|
||||
enable_autograd()
|
||||
|
||||
# Tiny Q&A corpus: is the "Q:" -> "A:" structure visible to the tokenizer?
test_data = (
    "Q: Hello!\n"
    "A: Hi there!\n"
    "\n"
    "Q: What is your name?\n"
    "A: I am TinyBot.\n"
    "\n"
    "Q: What color is the sky?\n"
    "A: The sky is blue.\n"
)

print("Testing if model can learn simple patterns...")
print(f"Test data: {test_data[:100]!r}...")

# Character-level tokenizer fitted on the corpus above
tokenizer = CharTokenizer()
tokenizer.build_vocab([test_data])
tokens = tokenizer.encode(test_data)

head = tokens[:20]
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Total tokens: {len(tokens)}")
print(f"First 20 tokens: {head}")
print(f"Decoded: {tokenizer.decode(head)!r}")

# Encode the two marker strings and show their round trip
q_colon_tokens = tokenizer.encode("Q:")
print(f"\n'Q:' tokens: {q_colon_tokens}")
print(f"'Q:' decoded: {tokenizer.decode(q_colon_tokens)!r}")

a_colon_tokens = tokenizer.encode("A:")
print(f"'A:' tokens: {a_colon_tokens}")
print(f"'A:' decoded: {tokenizer.decode(a_colon_tokens)!r}")

# Count how often each marker occurs in the raw text
print("\nPattern analysis:")
text_str = test_data
q_count = text_str.count("Q:")
a_count = text_str.count("A:")
print(f"'Q:' appears: {q_count} times")
print(f"'A:' appears: {a_count} times")

# Closing notes, printed one per line
print("\n".join((
    "\n✅ Tokenizer is working correctly!",
    "\nConclusion: The model should be able to learn that 'A:' follows 'Q:'",
    "If it's generating garbage, the model is either:",
    " 1. Too small (need more parameters)",
    " 2. Not trained enough (need more epochs)",
    " 3. Learning rate is wrong",
    " 4. Or there's a bug in the training loop",
)))
|
||||
|
||||
@@ -1,604 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TinyStories Text Generation (2017) - Transformer Era
|
||||
====================================================
|
||||
|
||||
📚 HISTORICAL CONTEXT:
|
||||
In 2017, Vaswani et al. published "Attention Is All You Need", showing that
|
||||
attention mechanisms alone (no RNNs!) could achieve state-of-the-art results
|
||||
on sequence tasks. This breakthrough launched the era of GPT, BERT, and modern LLMs.
|
||||
|
||||
🎯 WHAT YOU'RE BUILDING:
|
||||
Using YOUR TinyTorch implementations, you'll build a character-level language model
|
||||
that generates simple stories - proving YOUR attention mechanism works!
|
||||
|
||||
TinyStories is MUCH EASIER than Shakespeare:
|
||||
- Simple vocabulary (children's stories vs archaic English)
|
||||
- Clear sentence structure
|
||||
- Designed specifically for small models like ours!
|
||||
- Faster convergence and better results
|
||||
|
||||
✅ REQUIRED MODULES (Run after Module 13):
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||
Module 02 (Tensor) : YOUR data structure with autograd
|
||||
Module 03 (Activations) : YOUR ReLU in feed-forward networks
|
||||
Module 04 (Layers) : YOUR Linear layers
|
||||
Module 08 (Optimizers) : YOUR Adam optimizer
|
||||
Module 10 (Tokenization) : YOUR CharTokenizer for text→numbers
|
||||
Module 11 (Embeddings) : YOUR token & positional embeddings
|
||||
Module 12 (Attention) : YOUR multi-head self-attention
|
||||
Module 13 (Transformers) : YOUR LayerNorm + TransformerBlock
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||
|
||||
🏗️ ARCHITECTURE (Character-Level Language Model):
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Output Predictions │
|
||||
│ Character Probabilities (vocab_size) │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Output Projection │
|
||||
│ Module 04: vectors → vocabulary │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Layer Norm │
|
||||
│ Module 13: Final normalization │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
╔══════════════════════════════════════════════════════════════════════════════╗
|
||||
║ Transformer Block × N (Repeat) ║
|
||||
║ ┌────────────────────────────────────────────────────────────────────────┐ ║
|
||||
║ │ Feed Forward Network │ ║
|
||||
║ │ Module 04: Linear → ReLU → Linear │ ║
|
||||
║ └────────────────────────────────────────────────────────────────────────┘ ║
|
||||
║ ▲ ║
|
||||
║ ┌────────────────────────────────────────────────────────────────────────┐ ║
|
||||
║ │ Multi-Head Self-Attention │ ║
|
||||
║ │ Module 12: Query·Key^T·Value across all positions │ ║
|
||||
║ └────────────────────────────────────────────────────────────────────────┘ ║
|
||||
╚══════════════════════════════════════════════════════════════════════════════╝
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Positional Encoding │
|
||||
│ Module 11: Add position information │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Character Embeddings │
|
||||
│ Module 11: chars → embed_dim vectors │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Input Characters │
|
||||
│ "To be or not to be, that is..." │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
📊 EXPECTED PERFORMANCE:
|
||||
- Dataset: ~21MB TinyStories validation set (simple children's stories)
|
||||
- Training time: 30-45 minutes (proper training, faster than Shakespeare!)
|
||||
- Vocabulary: ~90 unique characters (simple English)
|
||||
- Expected: Coherent simple stories with proper grammar
|
||||
- Parameters: ~4.8M (perfect size for this task)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import numpy as np
|
||||
import argparse
|
||||
import time
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich import box
|
||||
|
||||
# Add project root to path
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(project_root)
|
||||
|
||||
console = Console()
|
||||
|
||||
# Import TinyTorch components YOU BUILT!
|
||||
from tinytorch.core.tensor import Tensor # Module 02: YOU built this!
|
||||
from tinytorch.core.layers import Linear # Module 04: YOU built this!
|
||||
from tinytorch.core.activations import ReLU, Softmax # Module 03: YOU built this!
|
||||
from tinytorch.core.optimizers import Adam # Module 08: YOU built this!
|
||||
from tinytorch.core.losses import CrossEntropyLoss # Module 04: YOU built this!
|
||||
from tinytorch.text.tokenization import CharTokenizer # Module 10: YOU built this!
|
||||
from tinytorch.text.embeddings import Embedding, PositionalEncoding # Module 11: YOU built this!
|
||||
from tinytorch.core.attention import MultiHeadAttention # Module 12: YOU built this!
|
||||
from tinytorch.models.transformer import LayerNorm, TransformerBlock # Module 13: YOU built this!
|
||||
from tinytorch.data.loader import DataLoader, Dataset # Module 08: YOU built this!
|
||||
|
||||
# Import dataset manager
|
||||
from data_manager import DatasetManager
|
||||
|
||||
|
||||
class TinyStoriesDataset(Dataset):
    """
    Character-level TinyStories dataset using YOUR Dataset interface (Module 08)
    and YOUR CharTokenizer (Module 10)!

    Tokenizes simple children's stories into characters for language modeling.
    Each item is an (input, target) pair of equal-length index windows where
    the target is the input shifted one character to the right.
    """

    def __init__(self, text, seq_length=64):
        """
        Initialize dataset with text and sequence length.

        Args:
            text: Raw TinyStories text
            seq_length: Length of input sequences
        """
        # Use YOUR CharTokenizer from Module 10!
        self.tokenizer = CharTokenizer()
        self.tokenizer.build_vocab([text])  # Build vocabulary from the story corpus
        self.vocab_size = self.tokenizer.vocab_size

        # Convert text to indices using YOUR tokenizer!
        self.data = self.tokenizer.encode(text)
        self.seq_length = seq_length

        # Number of start positions that still leave room for a full target
        # window. Clamped at zero: a text shorter than seq_length + 1 tokens
        # yields an empty dataset rather than a negative length (which would
        # make len() raise).
        self.num_sequences = max(0, len(self.data) - seq_length)

    def __getitem__(self, idx):
        """Get a single training sequence - YOUR Dataset interface!"""
        # Input: characters at positions [idx, idx+seq_length)
        # Target: characters at positions [idx+1, idx+seq_length+1)
        input_seq = self.data[idx:idx + self.seq_length]
        target_seq = self.data[idx + 1:idx + self.seq_length + 1]

        return (
            Tensor(np.array(input_seq, dtype=np.int32)),
            Tensor(np.array(target_seq, dtype=np.int32)),
        )

    def __len__(self):
        """Return dataset size - YOUR Dataset interface!"""
        return self.num_sequences

    def decode(self, indices):
        """Convert indices back to text using YOUR tokenizer!"""
        return self.tokenizer.decode(indices)
|
||||
|
||||
|
||||
class TinyGPT:
    """
    Character-level Transformer Language Model using YOUR TinyTorch!

    Pipeline: token embedding -> positional encoding -> N transformer
    blocks -> final layer norm -> linear projection to vocabulary logits.

    Instances are callable: ``model(x)`` is the same as ``model.forward(x)``.
    """

    def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers):
        """
        Build the model.

        Args:
            vocab_size: Number of distinct tokens (size of the output logits)
            embed_dim: Width of the token vectors
            max_length: Passed to PositionalEncoding as its length argument
            num_heads: Attention heads per transformer block
            num_layers: Number of stacked transformer blocks
        """
        # Token representation
        self.embedding = Embedding(vocab_size, embed_dim)  # Module 11!
        self.pos_encoding = PositionalEncoding(max_length, embed_dim)  # Module 11!

        # Transformer stack
        mlp_ratio = 4  # Standard 4x expansion in FFN (embed_dim * 4)
        self.layers = [
            TransformerBlock(embed_dim, num_heads, mlp_ratio)  # Module 13!
            for _ in range(num_layers)
        ]

        # Output head
        self.layer_norm = LayerNorm(embed_dim)  # Module 13!
        self.output_proj = Linear(embed_dim, vocab_size)  # Module 04!

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads

        # Calculate parameters
        self.total_params = self._count_parameters()

    def __call__(self, x):
        """Make the model callable; the training and generation code use model(x)."""
        return self.forward(x)

    def _count_parameters(self):
        """Count total scalar parameters in the model."""
        return sum(param.data.size for param in self.parameters())

    def parameters(self):
        """Get all trainable parameters from YOUR model.

        Side effect: sets ``requires_grad = True`` on every returned parameter.
        """
        params = []
        # Embedding parameters
        params.append(self.embedding.weight)
        params.extend(self.pos_encoding.parameters())  # Add positional encoding params!
        # Transformer block parameters (method or plain attribute, whichever the block exposes)
        for layer in self.layers:
            if hasattr(layer, 'parameters'):
                layer_params = layer.parameters
                params.extend(layer_params() if callable(layer_params) else layer_params)
        # Output head parameters
        params.extend([self.layer_norm.gamma, self.layer_norm.beta])
        params.extend([self.output_proj.weight, self.output_proj.bias])

        # Ensure all parameters have requires_grad=True
        for param in params:
            param.requires_grad = True

        return params

    def forward(self, x):
        """Forward pass through YOUR transformer stack.

        Returns logits reshaped to (batch_size, seq_len, vocab_size), where
        batch_size and seq_len are taken from the shape of the normalized
        hidden states.
        """
        # Convert tokens to contextual vectors
        x = self.embedding.forward(x)  # Module 11: char → vectors
        x = self.pos_encoding.forward(x)  # Module 11: add position info

        # Process through transformer layers
        for layer in self.layers:
            x = layer.forward(x)  # Module 13: Attention → FFN

        # Generate predictions
        x = self.layer_norm.forward(x)  # Module 13: final norm

        # Flatten to 2D for the Linear layer. Tensor.reshape() (not numpy)
        # is used so the computation graph is preserved.
        batch_size, seq_len, embed_dim = x.shape
        x_2d = x.reshape(batch_size * seq_len, embed_dim)

        # Apply output projection
        logits_2d = self.output_proj(x_2d)  # Module 04: vocab predictions

        # Reshape back - again via Tensor.reshape() to keep the graph
        logits = logits_2d.reshape(batch_size, seq_len, self.vocab_size)

        return logits
|
||||
|
||||
|
||||
def visualize_transformer():
    """Print an introductory panel and a two-column ASCII walkthrough.

    The left column sketches the data flow (characters -> embeddings ->
    positional encoding -> transformer layers -> predictions); the right
    column annotates what each stage contributes. Output only; returns None.
    """
    console.print("")
    console.print(Panel.fit(
        "[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n"
        "[yellow]The Problem:[/yellow]\n"
        "RNNs process sequences one step at a time\n"
        "Can't parallelize → slow training on long sequences\n"
        "Struggle with long-range dependencies\n\n"
        "[green]The Innovation:[/green]\n"
        "Transformers: Attention mechanisms process ENTIRE sequences in parallel\n"
        " • Self-attention: Every token attends to every other token\n"
        " • Multi-head attention: Learn multiple attention patterns\n"
        " • Positional encoding: Preserve sequence order\n\n"
        "[bold]Can attention alone match RNN performance?[/bold]",
        title="🎯 ACT 1: THE CHALLENGE",
        border_style="cyan",
        box=box.DOUBLE
    ))

    # Two-column diagram; columns are aligned with spaces so it renders
    # correctly in a monospace terminal.
    console.print("""
    How YOUR Transformer Sees Text:          What It Learns:

    Input: "To be or not to be"              Layer 1 (Attention):
    ┌─────────────────────┐                  • Each word attends to others
    │ T o   b e   o r ... │                  • "be" looks at "To", "or", etc.
    └─────────────────────┘                  • Captures dependencies
              ↓
    Character Embeddings                     Layer 2-4 (Deep Attention):
    ┌─────────────────────┐                  • Builds complex patterns
    │ 128-dim vectors     │                  • Grammar, style, meaning
    │ for each character  │                  • Shakespeare-specific patterns
    └─────────────────────┘
              ↓                              Output Prediction:
    Position Encoding                        "To be or not to be, that is the"
    ┌─────────────────────┐                            ↓
    │ Add positional info │                  Next char probabilities:
    │ (order matters!)    │                  't' → 0.85 (highest!)
    └─────────────────────┘                  'n' → 0.03
              ↓                              'a' → 0.02
    Transformer Layers ×4                    ...
    ┌─────────────────────┐
    │ Self-Attention      │                  Key Transformer Insight:
    │ Feed-Forward        │                  Unlike RNNs, attention lets each
    │ Layer Norm          │                  position look at ALL others
    └─────────────────────┘                  simultaneously - capturing long-range
              ↓                              dependencies in O(1) operations!
    Character Predictions
    ┌─────────────────────┐
    │ Probability for     │
    │ each next character │
    └─────────────────────┘
    """)
    print("="*70)
|
||||
|
||||
|
||||
def train_tinystories_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.01,
                          max_batches=500):
    """Train TinyGPT using YOUR complete training system with DataLoader!

    Args:
        model: Callable model exposing ``parameters()``; called as
            ``model(batch_input)`` and expected to return 3-D logits
            (unpacked below as batch, sequence, vocabulary).
        train_loader: Iterable of (input, target) batches; must expose
            ``dataset`` and ``batch_size`` for the banner.
        dataset: Unused here; kept so existing callers keep working.
        epochs: Number of passes over (at most ``max_batches`` of) the loader.
        learning_rate: Step size handed to Adam.
        max_batches: Upper bound on batches processed per epoch
            (previously hard-coded to 500).

    Returns:
        The same ``model`` object, trained in place.
    """
    console.print("\n[bold]🚀 Training TinyStories TinyGPT with YOUR TinyTorch![/bold]")
    console.print(f" Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences")
    console.print(f" Batch size: [cyan]{train_loader.batch_size}[/cyan]")
    console.print(f" Learning rate: [cyan]{learning_rate}[/cyan] (1e-2, optimal for 4.8M param model)")
    console.print(" YOUR DataLoader (Module 08) handles batching!")
    console.print(" YOUR Adam optimizer (Module 08)")
    console.print(" YOUR CrossEntropyLoss (Module 04) with autograd!")

    # YOUR optimizer and loss function.
    # Note: large models (100M+) typically use ~3e-4; this small model is
    # trained with a higher default learning rate.
    optimizer = Adam(model.parameters(), lr=learning_rate)
    loss_fn = CrossEntropyLoss()  # YOUR loss function with autograd!

    for epoch in range(epochs):
        console.print(f"\n [bold]Epoch {epoch+1}/{epochs}:[/bold]")
        epoch_loss = 0
        batch_count = 0

        # Use YOUR DataLoader to iterate through batches!
        for batch_idx, (batch_input, batch_target) in enumerate(train_loader):
            if batch_idx >= max_batches:  # per-epoch cap keeps runtime bounded
                break

            if batch_idx == 0:
                console.print(" [dim]Processing first batch... (this may take a moment)[/dim]")

            # Forward pass with YOUR Transformer
            logits = model(batch_input)  # YOUR attention mechanism!

            # Reshape for loss computation: (batch, seq, vocab) -> (batch*seq, vocab)
            # IMPORTANT: Use Tensor.reshape() to preserve computation graph!
            batch_size, seq_length, vocab_size = logits.shape
            logits_2d = logits.reshape(batch_size * seq_length, vocab_size)
            targets_1d = batch_target.reshape(-1)

            # Compute loss with YOUR CrossEntropyLoss (connects to autograd!)
            loss = loss_fn.forward(logits_2d, targets_1d)  # Module 04 + Module 05!
            loss_value = float(loss.data)

            # Backward pass with YOUR autograd
            optimizer.zero_grad()  # Module 08!
            loss.backward()  # Module 05: YOUR autodiff!
            optimizer.step()  # Module 08!

            epoch_loss += loss_value
            batch_count += 1

            # Progress: first batch, then every 10th, so the user sees
            # continuous output. (Every 50th is already a multiple of 10.)
            if batch_idx == 0 or (batch_idx + 1) % 10 == 0:
                avg_loss = epoch_loss / batch_count
                console.print(f" Batch {batch_idx+1}/{max_batches} | Loss: {loss_value:.4f} | Avg: {avg_loss:.4f}")

        # Epoch summary (max() guards an empty loader)
        avg_loss = epoch_loss / max(1, batch_count)
        console.print(f" → Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)")

    return model
|
||||
|
||||
|
||||
def generate_text(model, dataset, prompt="To be or not", max_length=200, temperature=0.8):
    """
    Generate text from a prompt - THE WOW MOMENT!

    This is autoregressive generation: predict next char, add it, repeat.

    Args:
        model: Callable returning logits indexable as [batch, position, vocab].
        dataset: Provides ``seq_length`` and ``decode``; the prompt is encoded
            through ``dataset.char_to_idx`` when that mapping exists,
            otherwise through ``dataset.tokenizer.encode``.
        prompt: Seed text.
        max_length: Number of characters to generate after the prompt.
        temperature: Softmax temperature; values <= 0 select the most likely
            character deterministically instead of sampling.

    Returns:
        The decoded prompt plus generated continuation.
    """
    console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]")
    console.print("="*70)

    # Convert prompt to indices. Datasets in this file keep their vocabulary
    # inside a tokenizer rather than a char_to_idx dict, so support both.
    char_to_idx = getattr(dataset, 'char_to_idx', None)
    if char_to_idx is not None:
        prompt_indices = [char_to_idx[ch] for ch in prompt if ch in char_to_idx]
    else:
        prompt_indices = [int(i) for i in dataset.tokenizer.encode(prompt)]
    generated = list(prompt_indices)

    console.print(f"📝 Prompt: [cyan]\"{prompt}\"[/cyan]")
    console.print(f"🎯 Generating [cyan]{max_length}[/cyan] characters...\n")

    seq_length = dataset.seq_length

    # Generate character by character
    for _ in range(max_length):
        # Take the last seq_length characters as input (fewer if not yet available)
        input_seq = generated[-seq_length:]

        # Left-pad with index 0 so the model always sees a full-length window
        if len(input_seq) < seq_length:
            input_seq = [0] * (seq_length - len(input_seq)) + input_seq

        # Forward pass
        input_tensor = Tensor(np.array([input_seq], dtype=np.int32))
        logits = model(input_tensor)

        # Unwrap to a plain array (logits.data may itself wrap the array)
        raw = logits.data
        if not isinstance(raw, np.ndarray) and hasattr(raw, 'data'):
            raw = raw.data
        next_logits = np.asarray(raw)[0, -1, :]  # Last position predictions

        if temperature <= 0:
            # Degenerate temperature: greedy decoding avoids dividing by zero
            next_idx = int(np.argmax(next_logits))
        else:
            # Temperature-scaled, max-shifted softmax, then sample
            scaled = next_logits / temperature
            exp_logits = np.exp(scaled - np.max(scaled))
            probs = exp_logits / np.sum(exp_logits)
            next_idx = int(np.random.choice(len(probs), p=probs))

        generated.append(next_idx)  # plain int keeps the token list homogeneous

    # Decode to text
    generated_text = dataset.decode(generated)

    console.print("[bold]📖 Generated Text:[/bold]")
    console.print("─" * 70)
    console.print(f"[green]{generated_text}[/green]")
    console.print("─" * 70)

    return generated_text
|
||||
|
||||
|
||||
def analyze_transformer_systems(model):
    """Print a systems-level summary panel for the given model.

    Reads ``model.total_params``, ``model.embed_dim`` and ``model.vocab_size``;
    the parameter memory figure is total_params * 4 / 1024, labelled KB.
    Output only; returns None.
    """
    param_kb = model.total_params * 4 / 1024

    architecture = (
        f"[bold]Model Architecture:[/bold]\n"
        f" • Parameters: [cyan]{model.total_params:,}[/cyan] weights\n"
        f" • Embedding dim: [cyan]{model.embed_dim}[/cyan]\n"
        f" • Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n"
    )
    complexity = (
        "[bold]Computational Complexity:[/bold]\n"
        " • Attention: O(n²·d) where n=sequence, d=dimension\n"
        " • Self-attention allows parallel processing (vs RNN sequential)\n"
        " • YOUR implementation: Pure Python + NumPy\n\n"
    )
    memory = (
        "[bold]Memory Requirements:[/bold]\n"
        f" • Parameters: [cyan]{param_kb:.1f} KB[/cyan]\n"
        " • Attention matrices: O(n²) per layer\n"
        " • YOUR TinyTorch tracks gradients automatically\n\n"
    )
    evolution = (
        "[bold]🏛️ Transformer Evolution:[/bold]\n"
        " • 2017: Vaswani et al. 'Attention Is All You Need'\n"
        " • 2018: BERT (bidirectional), GPT (autoregressive)\n"
        " • 2020: GPT-3 (175B params, same architecture!)\n"
        " • 2022: ChatGPT (YOUR architecture at massive scale)\n"
        " • YOUR TinyGPT: Core principles that power them all!\n\n"
    )
    why_dominant = (
        "[bold]💡 Why Transformers Dominate:[/bold]\n"
        " • Parallelizable (vs sequential RNNs)\n"
        " • Long-range dependencies (attention sees everything)\n"
        " • Scalable (architecture works from 1M to 175B params)\n"
        " • YOUR implementation demonstrates all of these!"
    )

    report = "".join((architecture, complexity, memory, evolution, why_dominant))

    console.print("")
    console.print(Panel.fit(
        report,
        title="🔬 SYSTEMS ANALYSIS",
        border_style="cyan",
        box=box.DOUBLE
    ))
|
||||
|
||||
|
||||
def main():
    """Demonstrate TinyStories text generation using YOUR TinyTorch!

    Steps: parse CLI options, load the TinyStories validation text, build
    the dataset/DataLoader and the TinyGPT model, optionally stop after an
    architecture smoke test, otherwise train, generate sample stories and
    print a systems analysis plus timing.
    """
    parser = argparse.ArgumentParser(description='TinyStories Transformer 2017')
    parser.add_argument('--test-only', action='store_true',
                        help='Test architecture only')
    parser.add_argument('--epochs', type=int, default=20,
                        help='Training epochs')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='Batch size')
    parser.add_argument('--seq-length', type=int, default=128,
                        help='Sequence length')
    parser.add_argument('--embed-dim', type=int, default=256,
                        help='Embedding dimension')
    parser.add_argument('--num-layers', type=int, default=6,
                        help='Number of transformer layers')
    parser.add_argument('--num-heads', type=int, default=8,
                        help='Number of attention heads')
    parser.add_argument('--visualize', action='store_true', default=True,
                        help='Show transformer visualization')
    # --visualize defaults to True, so on its own it could never be turned
    # off; this companion flag writes False into the same destination.
    parser.add_argument('--no-visualize', dest='visualize', action='store_false',
                        help='Skip transformer visualization')
    parser.add_argument('--quick-test', action='store_true',
                        help='Use small subset for testing')
    args = parser.parse_args()

    console.print("")
    console.print(Panel.fit(
        "[bold cyan]TinyStories Transformer - Simple Story Generation![/bold cyan]\n\n"
        "[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n"
        "[green]YOUR achievement:[/green] Generate coherent children's stories\n"
        "[cyan]Components used:[/cyan] YOUR complete NLP pipeline (Modules 2, 3, 4, 8, 10, 11, 12, 13)\n"
        "[dim]Note: TinyStories is much easier than Shakespeare - designed for small models![/dim]",
        title="🎯 Milestone 05: Transformer Era (2017)",
        border_style="cyan",
        box=box.DOUBLE
    ))

    # Visualization
    if args.visualize:
        visualize_transformer()

    # Step 1: Load TinyStories dataset
    console.print("\n[bold]📥 Loading TinyStories dataset...[/bold]")

    # Load TinyStories from downloaded file
    tinystories_path = os.path.join(
        os.path.dirname(__file__),
        '../datasets/tinystories/tinystories_val.txt'
    )

    if not os.path.exists(tinystories_path):
        console.print(f"[red]❌ TinyStories not found at {tinystories_path}[/red]")
        console.print("[yellow]Run: python milestones/05_2017_transformer/download_tinystories.py[/yellow]")
        return

    with open(tinystories_path, 'r', encoding='utf-8') as f:
        text = f.read()

    console.print(f"📊 Loaded: {len(text):,} characters, {len(text.split()):,} words")

    if args.quick_test:
        text = text[:100000]  # Use small subset for testing (100K chars)
        console.print(" [dim](Using 100K char subset for quick testing)[/dim]")

    # Step 2: Create Dataset and DataLoader using YOUR Module 08!
    console.print("\n[bold]📦 Creating YOUR Dataset and DataLoader (Module 08)...[/bold]")
    dataset = TinyStoriesDataset(text, seq_length=args.seq_length)

    if dataset.num_sequences <= 0:
        # Nothing to train on: the text does not contain even one full window.
        console.print(f"[red]❌ Text is too short for seq_length={args.seq_length}[/red]")
        return

    # YOUR DataLoader handles batching and shuffling!
    train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters")
    console.print(f" Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]")
    console.print(f" DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]")

    # Step 3: Build Transformer
    model = TinyGPT(
        vocab_size=dataset.vocab_size,
        embed_dim=args.embed_dim,
        max_length=args.seq_length,
        num_heads=args.num_heads,
        num_layers=args.num_layers
    )

    # Display model info
    console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]")
    console.print(f" Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings")
    console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters")
    console.print(f" Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)")

    if args.test_only:
        console.print("\n[bold yellow]🧪 ARCHITECTURE TEST MODE[/bold yellow]")
        # Test with minimal data
        test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32))
        test_output = model(test_input)
        console.print(f"[green]✅ Forward pass successful! Output shape: {test_output.data.shape}[/green]")
        console.print("[green]✅ YOUR Transformer + DataLoader work together![/green]")
        return

    # Step 4: Train using YOUR DataLoader
    start_time = time.time()
    model = train_tinystories_gpt(model, train_loader, dataset, epochs=args.epochs)
    train_time = time.time() - start_time

    # Step 5: Generate text!
    generate_text(model, dataset, prompt="Once upon a time", max_length=200)

    # Additional generation examples
    console.print("\n[bold]🎭 More Generation Examples:[/bold]")
    console.print("─" * 70)

    # The tokenizer vocabulary was built from `text`, so a prompt is usable
    # exactly when every one of its characters occurs in `text`.
    vocab_chars = set(text)
    prompts = ["One day", "The little", "She was"]
    for prompt in prompts:
        if all(ch in vocab_chars for ch in prompt):
            console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]")
            generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8)

    # Step 6: Systems Analysis
    analyze_transformer_systems(model)

    # Throughput: training stops each epoch after 500 batches (the trainer's
    # default cap), so count the sequences actually processed rather than
    # the whole dataset. Approximate: the last batch may be smaller.
    train_batch_cap = 500
    batches_per_epoch = min(train_batch_cap, -(-len(dataset) // args.batch_size))
    sequences_seen = batches_per_epoch * args.batch_size * args.epochs

    console.print(f"\n[bold]⏱️ Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds")
    console.print(f" Sequences/sec: [cyan]{sequences_seen / max(train_time, 1e-9):.0f}[/cyan]")

    console.print("")
    console.print(Panel.fit(
        "[bold green]✅ SUCCESS! TinyStories Transformer Milestone Complete![/bold green]\n\n"

        "[bold]🎓 What YOU Accomplished:[/bold]\n"
        " • YOUR attention mechanism processes sequences in parallel\n"
        " • YOUR transformer captures long-range text dependencies\n"
        " • YOUR DataLoader efficiently batches character sequences\n"
        " • YOUR TinyGPT generates coherent text!\n"
        " • YOUR complete language modeling system works!\n\n"

        "[bold]🚀 Next Steps:[/bold]\n"
        " • Continue to Module 14 (KV-Caching) for 3x faster inference\n"
        " • YOUR transformer architecture scales to GPT-scale models\n"
        " • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!",

        title="🌟 2017 Transformer Revolution Complete",
        border_style="green",
        box=box.DOUBLE
    ))


if __name__ == "__main__":
    main()
|
||||
@@ -1,375 +0,0 @@
|
||||
"""
|
||||
TinyTalks Chatbot - Train a Simple Conversational AI in 10-15 Minutes
|
||||
======================================================================
|
||||
|
||||
A minimal but functional chatbot trained on simple Q&A pairs.
|
||||
|
||||
Goal: Show that transformers can learn conversational patterns quickly!
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.models.transformer import GPT
|
||||
from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
|
||||
|
||||
enable_autograd()
|
||||
|
||||
# ============================================================================
|
||||
# Tokenization
|
||||
# ============================================================================
|
||||
|
||||
def create_tokenizer(conversations):
    """
    Build a character-level vocabulary from (question, answer) pairs.

    Ids 0-3 are reserved for the special tokens <PAD>, <SOS>, <SEP> and <EOS>.
    Every distinct character found in the conversations receives the next
    free id, handed out in sorted character order.

    Returns:
        (char_to_idx, idx_to_char): the forward and inverse lookup tables.
    """
    reserved = ['<PAD>', '<SOS>', '<SEP>', '<EOS>']

    # Questions and answers are glued with spaces, so the space character is
    # part of the vocabulary as soon as there is at least one conversation.
    corpus = ' '.join(question + ' ' + answer for question, answer in conversations)
    symbols = reserved + sorted(set(corpus))

    char_to_idx = {}
    idx_to_char = {}
    for token_id, symbol in enumerate(symbols):
        char_to_idx[symbol] = token_id
        idx_to_char[token_id] = symbol

    return char_to_idx, idx_to_char
|
||||
|
||||
|
||||
def encode_conversation(question, answer, char_to_idx, max_len=80):
    """
    Turn one Q&A pair into a fixed-length list of token ids.

    Layout: <SOS> question <SEP> answer <EOS> <PAD>...

    Example:
        Q: "Hi"
        A: "Hello"
        → [<SOS>, H, i, <SEP>, H, e, l, l, o, <EOS>, <PAD>, ...]

    Characters missing from char_to_idx are encoded as id 0. Sequences shorter
    than max_len are right-padded with <PAD>; longer ones are cut to max_len,
    which can drop the trailing <EOS>.
    """
    question_ids = [char_to_idx.get(ch, 0) for ch in question]
    answer_ids = [char_to_idx.get(ch, 0) for ch in answer]

    sequence = [
        char_to_idx['<SOS>'],
        *question_ids,
        char_to_idx['<SEP>'],
        *answer_ids,
        char_to_idx['<EOS>'],
    ]

    missing = max_len - len(sequence)
    if missing > 0:
        return sequence + [char_to_idx['<PAD>']] * missing
    return sequence[:max_len]
|
||||
|
||||
|
||||
def decode_tokens(tokens, idx_to_char, stop_at_eos=True):
    """
    Convert token ids back into text.

    <SOS> is dropped and <SEP> is rendered as ' | '. <PAD> and <EOS> end the
    decoding when stop_at_eos is true and are silently skipped otherwise.
    Ids missing from idx_to_char are rendered as '?'.
    """
    PAD, SOS, SEP, EOS = 0, 1, 2, 3

    pieces = []
    for token in tokens:
        if token in (PAD, EOS):
            if stop_at_eos:
                break
            continue
        if token == SOS:
            continue
        pieces.append(' | ' if token == SEP else idx_to_char.get(token, '?'))
    return ''.join(pieces)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Training
|
||||
# ============================================================================
|
||||
|
||||
def train_chatbot(model, optimizer, loss_fn, train_data, max_time_minutes=10):
    """
    Train the TinyTalks chatbot for a fixed wall-clock budget.

    Each step samples one encoded conversation at random and performs
    next-token prediction on it (input = tokens[:-1], target = tokens[1:]),
    backpropagates, clips every gradient element-wise to [-1, 1] and applies
    one optimizer step. Progress is printed every 500 steps during the first
    2000 steps and at every 2-minute checkpoint.

    Args:
        model: Object exposing forward(x) and parameters().
        optimizer: Object exposing zero_grad() and step().
        loss_fn: Object exposing forward(logits, targets).
        train_data: Sequence of token-id lists; must be non-empty for any
            step to run (random sampling from an empty sequence raises).
        max_time_minutes: Training budget in minutes of wall-clock time.

    Returns:
        (losses, step): the per-step loss values and the number of steps run.
        If the budget expires before the first step, losses is empty and the
        printed loss statistics are nan.
    """
    max_time_seconds = max_time_minutes * 60
    rule = "=" * 70

    print(rule)
    print(f"TRAINING TINYTALKS CHATBOT FOR {max_time_minutes} MINUTES")
    print(rule)
    print(f"Dataset: {len(train_data)} conversations")
    print(f"Time limit: {max_time_seconds}s ({max_time_minutes} minutes)")
    print()

    start_time = time.time()
    losses = []
    step = 0

    def recent_mean():
        # Mean of the last 100 losses; the slice already yields the whole
        # list while fewer than 100 steps have been recorded.
        return np.mean(losses[-100:])

    # Progress checkpoints every 2 minutes
    checkpoint_interval = 120
    next_checkpoint = checkpoint_interval

    print("Training started...")
    print()

    while True:
        elapsed = time.time() - start_time
        if elapsed >= max_time_seconds:
            break

        # Sample a random conversation
        tokens = train_data[np.random.randint(len(train_data))]

        # Next-token prediction: the target is the input shifted by one
        input_seq = tokens[:-1]
        target_seq = tokens[1:]

        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)

        # Forward
        logits = model.forward(x)

        # Loss over all positions, flattened to (batch * seq, vocab)
        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = y_true.reshape(batch_size * seq_len)
        loss = loss_fn.forward(logits_flat, targets_flat)

        # Backward
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients element-wise, in place
        for param in model.parameters():
            if param.grad is not None:
                np.clip(param.grad, -1.0, 1.0, out=param.grad)

        # Update
        optimizer.step()

        losses.append(loss.data.item())
        step += 1

        # Show progress at checkpoints
        if elapsed >= next_checkpoint:
            avg_loss = recent_mean()
            steps_per_sec = step / elapsed
            mins = int(elapsed / 60)
            print(f"[{mins:2d} min] Step {step:5d} | Loss: {avg_loss:.4f} | Speed: {steps_per_sec:.1f} steps/sec")
            next_checkpoint += checkpoint_interval

        # Also show every 500 steps for early progress
        if step % 500 == 0 and step <= 2000:
            avg_loss = recent_mean()
            print(f"[{int(elapsed):4d}s] Step {step:5d} | Loss: {avg_loss:.4f}")

    final_elapsed = time.time() - start_time
    if losses:
        final_loss = recent_mean()
        initial_loss = np.mean(losses[:10])
    else:
        # No step completed: report nan instead of averaging an empty list,
        # which would emit a NumPy "mean of empty slice" warning.
        final_loss = initial_loss = float("nan")
    improvement = (1 - final_loss / initial_loss) * 100
    # Guard against a zero-length run on clocks with coarse resolution
    final_speed = step / final_elapsed if final_elapsed > 0 else 0.0

    print()
    print(rule)
    print("TRAINING COMPLETE")
    print(rule)
    print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.1f} minutes)")
    print(f"Total steps: {step:,}")
    print(f"Steps/second: {final_speed:.1f}")
    print(f"Initial loss: {initial_loss:.4f}")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Improvement: {improvement:.1f}%")
    print()

    return losses, step
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Generation / Chat
|
||||
# ============================================================================
|
||||
|
||||
def generate_response(model, question, char_to_idx, idx_to_char, max_len=50, context_len=80):
    """
    Generate a response to a question with greedy decoding.

    Process:
    1. Encode: <SOS> question <SEP>
    2. Generate tokens until <EOS>, <PAD> or max_len
    3. Decode generated tokens

    Args:
        model: Object whose forward(x) returns logits exposing .shape and
            .data indexable as [batch, position, vocab].
        question: Prompt text; characters missing from char_to_idx map to id 0.
        char_to_idx: Vocabulary containing the special tokens
            <PAD>, <SOS>, <SEP> and <EOS>.
        idx_to_char: Inverse vocabulary used to decode the generated ids.
        max_len: Maximum number of tokens to generate.
        context_len: Length the model input is padded/truncated to. It should
            match the sequence length used for training; the default of 80
            matches the training max_len used in this file.

    Returns:
        The decoded response text (without the prompt).
    """
    pad_id = char_to_idx['<PAD>']
    stop_ids = (char_to_idx['<EOS>'], pad_id)

    # Encode question
    tokens = [char_to_idx['<SOS>']]
    tokens.extend(char_to_idx.get(c, 0) for c in question)
    tokens.append(char_to_idx['<SEP>'])

    # Generate response
    generated_tokens = []
    for _ in range(max_len):
        sequence = tokens + generated_tokens

        # Right-pad (or truncate) to the model's expected input length
        input_tokens = (sequence + [pad_id] * (context_len - len(sequence)))[:context_len]

        # Forward pass
        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        # The prediction for the next token sits at the last real position
        next_pos = len(sequence) - 1
        if next_pos >= logits.shape[1]:
            # The sequence no longer fits in the model's context
            break

        next_token = int(np.argmax(logits.data[0, next_pos, :]))

        # Stop at EOS or PAD
        if next_token in stop_ids:
            break

        generated_tokens.append(next_token)

    # Decode generated response
    return decode_tokens(generated_tokens, idx_to_char, stop_at_eos=False)
|
||||
|
||||
|
||||
def test_chatbot(model, test_questions, char_to_idx, idx_to_char):
    """Print the chatbot's generated answer for each sample question."""
    rule = "=" * 70
    print(rule)
    print("TESTING CHATBOT")
    print(rule)
    print()

    for prompt in test_questions:
        reply = generate_response(model, prompt, char_to_idx, idx_to_char)
        print(f"Q: {prompt}")
        print(f"A: {reply}")
        print()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """
    Run the TinyTalks chatbot demo end to end.

    Loads the dataset, builds the character tokenizer, trains a one-layer GPT
    for a fixed time budget, prints responses to a few sample questions and
    finishes with a summary of the run.
    """
    rule = "=" * 70

    print()
    print(rule)
    print("TINYTALKS CHATBOT - 10-15 MINUTE TRAINING")
    print(rule)
    print()

    # Dataset and its descriptive statistics
    conversations = create_tinytalks_dataset()
    stats = get_dataset_stats()

    print(f"Dataset: {stats['total_examples']} examples ({stats['unique_examples']} unique)")
    print(f"Repetition: {stats['repetition_factor']:.1f}x for better learning")
    print(f"Avg lengths: Q={stats['avg_question_len']:.1f} chars, A={stats['avg_answer_len']:.1f} chars")
    print()

    # Tokenizer
    char_to_idx, idx_to_char = create_tokenizer(conversations)
    vocab_size = len(idx_to_char)
    print(f"Vocabulary: {vocab_size} tokens (including special tokens)")
    print()

    # Every conversation becomes a fixed-length id sequence
    max_seq_len = 80
    train_data = [
        encode_conversation(question, answer, char_to_idx, max_seq_len)
        for question, answer in conversations
    ]

    # Model: ultra-tiny for speed (learned from 5-min test!)
    # Target: ~20-30 steps/sec with longer sequences
    # In 10 mins (600s): ~12,000-18,000 steps
    config = dict(
        vocab_size=vocab_size,
        embed_dim=16,       # Keep it tiny!
        num_layers=1,       # Just 1 layer
        num_heads=2,        # 2 heads
        max_seq_len=max_seq_len,
    )

    print("Model configuration:")
    for name, value in config.items():
        print(f" {name}: {value}")
    print()

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    print()

    # Optimizer and loss
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Training budget in minutes (adjustable)
    train_time = 15
    print(f"Training for {train_time} minutes...")
    print()

    losses, total_steps = train_chatbot(
        model, optimizer, loss_fn, train_data, max_time_minutes=train_time
    )

    # Sample questions to probe the trained model
    test_questions = [
        "Hi",
        "How are you",
        "What is your name",
        "What is the sky",
        "Is grass green",
        "What is 1 plus 1",
        "Are you happy",
        "Bye",
    ]

    print("Testing chatbot responses...")
    print()
    test_chatbot(model, test_questions, char_to_idx, idx_to_char)

    # Summary: compare the first 10 losses with the last 100
    start_loss = np.mean(losses[:10])
    end_loss = np.mean(losses[-100:])

    print(rule)
    print("TINYTALKS SUMMARY")
    print(rule)
    print(f"✓ Model: {num_params:,} parameters")
    print(f"✓ Training: {train_time} minutes, {total_steps:,} steps")
    print(f"✓ Loss: {start_loss:.4f} → {end_loss:.4f}")
    print(f"✓ Improvement: {(1 - end_loss/start_loss)*100:.1f}%")
    print()
    print("Try it yourself:")
    print(" 1. Ask simple questions from the training set")
    print(" 2. The model should generate learned responses")
    print(" 3. Experiment with model size and training time!")
    print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,546 +0,0 @@
|
||||
"""
|
||||
TinyTalks Interactive Dashboard - Watch Learning Happen Live!
|
||||
=============================================================
|
||||
|
||||
A beautiful, educational dashboard showing a transformer learn to chat.
|
||||
|
||||
Students see:
|
||||
- Live training metrics
|
||||
- Responses improving from gibberish to coherent
|
||||
- Real-time checkpoints with before/after comparison
|
||||
- Visual feedback on what's correct vs incorrect
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.models.transformer import GPT
|
||||
from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
|
||||
|
||||
enable_autograd()
|
||||
|
||||
# Rich CLI imports
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.layout import Layout
|
||||
from rich.live import Live
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
|
||||
from rich import box
|
||||
from rich.text import Text
|
||||
|
||||
console = Console()
|
||||
|
||||
# ============================================================================
|
||||
# Tokenization (same as tinytalks_chatbot.py)
|
||||
# ============================================================================
|
||||
|
||||
def create_tokenizer(conversations):
    """
    Create the character-level tokenizer tables for the given conversations.

    The special tokens <PAD>, <SOS>, <SEP>, <EOS> occupy ids 0-3; the distinct
    characters of all questions and answers follow in sorted order.

    Returns:
        (char_to_idx, idx_to_char)
    """
    joined = ' '.join([question + ' ' + answer for question, answer in conversations])
    vocabulary = ['<PAD>', '<SOS>', '<SEP>', '<EOS>'] + sorted(set(joined))

    char_to_idx = {symbol: position for position, symbol in enumerate(vocabulary)}
    idx_to_char = dict(enumerate(vocabulary))
    return char_to_idx, idx_to_char
|
||||
|
||||
|
||||
def encode_conversation(question, answer, char_to_idx, max_len=80):
    """
    Encode a Q&A pair as: <SOS> question <SEP> answer <EOS> <PAD>...

    Unknown characters are encoded as id 0. The result is right-padded with
    <PAD> up to max_len, or cut to max_len when it is too long.
    """
    def lookup(text):
        # Characters outside the vocabulary fall back to id 0
        return [char_to_idx.get(ch, 0) for ch in text]

    ids = [char_to_idx['<SOS>']]
    ids += lookup(question)
    ids.append(char_to_idx['<SEP>'])
    ids += lookup(answer)
    ids.append(char_to_idx['<EOS>'])

    if len(ids) >= max_len:
        return ids[:max_len]
    return ids + [char_to_idx['<PAD>']] * (max_len - len(ids))
|
||||
|
||||
|
||||
def decode_tokens(tokens, idx_to_char):
    """
    Decode token ids to a string.

    <PAD> (0), <SOS> (1) and <SEP> (2) are skipped, decoding stops at the
    first <EOS> (3), and ids missing from idx_to_char become '?'.
    """
    skipped = (0, 1, 2)   # PAD, SOS, SEP
    end_of_sequence = 3   # EOS

    text = []
    for token in tokens:
        if token == end_of_sequence:
            break
        if token in skipped:
            continue
        text.append(idx_to_char.get(token, '?'))
    return ''.join(text)
|
||||
|
||||
|
||||
def generate_response(model, question, char_to_idx, idx_to_char, max_len=50, context_len=80):
    """
    Generate a response to a question with greedy decoding.

    The prompt <SOS> question <SEP> is extended one token at a time with the
    highest-scoring next token until <EOS> or <PAD> is predicted, max_len
    tokens have been produced, or the sequence no longer fits the context.

    Args:
        model: Object whose forward(x) returns logits exposing .shape and
            .data indexable as [batch, position, vocab].
        question: Prompt text; characters missing from char_to_idx map to id 0.
        char_to_idx: Vocabulary containing the special tokens
            <PAD>, <SOS>, <SEP> and <EOS>.
        idx_to_char: Inverse vocabulary used to decode the generated ids.
        max_len: Maximum number of tokens to generate.
        context_len: Length the model input is padded/truncated to. It should
            match the sequence length used for training; the default of 80
            matches the max_len used when encoding the training data here.

    Returns:
        The decoded response text (without the prompt).
    """
    pad_id = char_to_idx['<PAD>']
    stop_ids = (char_to_idx['<EOS>'], pad_id)

    tokens = [char_to_idx['<SOS>']]
    tokens.extend(char_to_idx.get(c, 0) for c in question)
    tokens.append(char_to_idx['<SEP>'])

    generated_tokens = []
    for _ in range(max_len):
        sequence = tokens + generated_tokens

        # Right-pad (or truncate) to the model's expected input length
        input_tokens = (sequence + [pad_id] * (context_len - len(sequence)))[:context_len]

        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        # The prediction for the next token sits at the last real position
        next_pos = len(sequence) - 1
        if next_pos >= logits.shape[1]:
            # The sequence no longer fits in the model's context
            break

        next_token = int(np.argmax(logits.data[0, next_pos, :]))
        if next_token in stop_ids:
            break

        generated_tokens.append(next_token)

    return decode_tokens(generated_tokens, idx_to_char)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Dashboard Components
|
||||
# ============================================================================
|
||||
|
||||
def create_welcome_panel():
    """Build the introductory panel shown before training starts."""
    message = (
        "[bold cyan]🤖 TINYTALKS - Watch a Transformer Learn to Chat![/bold cyan]\n\n"
        "[dim]You're about to see AI learning happen in real-time.\n"
        "The model starts knowing nothing - just random noise.\n"
        "Every training step makes it slightly smarter.\n"
        "Watch responses improve from gibberish to coherent conversation![/dim]\n\n"
        "[bold]Training Duration:[/bold] 10-15 minutes\n"
        "[bold]Checkpoints:[/bold] Every ~2 minutes\n"
        "[bold]What to watch:[/bold] Loss ↓ = Better responses ✓"
    )
    panel_options = {
        "title": "🎓 Educational AI Training Demo",
        "border_style": "cyan",
        "box": box.DOUBLE,
    }
    return Panel.fit(message, **panel_options)
|
||||
|
||||
|
||||
def create_metrics_table(step, loss, elapsed, steps_per_sec):
    """
    Build a headerless two-column table of the current training metrics.

    Rows: step count, loss, elapsed time as minutes/seconds, and speed.
    """
    metrics = Table(show_header=False, box=box.SIMPLE, padding=(0, 2))
    metrics.add_column("Metric", style="cyan")
    metrics.add_column("Value", style="green bold")

    rows = (
        ("Step", f"{step:,}"),
        ("Loss", f"{loss:.4f}"),
        ("Time", f"{int(elapsed/60)}m {int(elapsed%60)}s"),
        ("Speed", f"{steps_per_sec:.1f} steps/sec"),
    )
    for label, value in rows:
        metrics.add_row(label, value)

    return metrics
|
||||
|
||||
|
||||
def create_checkpoint_comparison(checkpoint_num, step, loss, test_results, expected_answers):
    """
    Create a checkpoint table grading each model response.

    Responses are compared with the expected answers after stripping
    surrounding whitespace and lower-casing:
      - Perfect: exact match
      - Close:   both non-empty and one contains the other
      - Wrong:   non-empty but neither of the above
      - Empty:   the model produced nothing

    Args:
        checkpoint_num: Label shown in the title (a number or e.g. "FINAL").
        step: Training step at which the evaluation happened.
        loss: Loss value shown in the title.
        test_results: List of (question, actual_response) pairs.
        expected_answers: Expected answers, in the same order as test_results.

    Returns:
        A rich Table with one row per evaluated question; the title carries
        the exact-match accuracy (0% when there are no results).
    """
    styles = {
        "perfect": ("[green]✓ Perfect[/green]", "green"),
        "close": ("[yellow]≈ Close[/yellow]", "yellow"),
        "wrong": ("[red]✗ Wrong[/red]", "red"),
        "empty": ("[dim]- Empty[/dim]", "dim"),
    }

    # Grade every response once; the grades drive both accuracy and the rows
    graded = []
    for (question, actual), expected in zip(test_results, expected_answers):
        got = actual.strip().lower()
        want = expected.strip().lower()
        if got == want:
            verdict = "perfect"
        elif got and want and (want in got or got in want):
            # Both sides must be non-empty: the empty string is a substring
            # of every string, which would label an empty response "Close".
            verdict = "close"
        elif got:
            verdict = "wrong"
        else:
            verdict = "empty"
        graded.append((question, actual, verdict))

    correct = sum(1 for _, _, verdict in graded if verdict == "perfect")
    # Avoid dividing by zero when no test questions were supplied
    accuracy = (correct / len(test_results)) * 100 if test_results else 0.0

    # Create results table
    table = Table(
        title=f"Checkpoint {checkpoint_num} - Step {step:,} | Loss: {loss:.4f} | Accuracy: {accuracy:.0f}%",
        box=box.ROUNDED,
        show_header=True
    )
    table.add_column("Question", style="cyan", width=22)
    table.add_column("Model Response", style="white", width=28)
    table.add_column("Status", justify="center", width=8)

    for question, actual, verdict in graded:
        status, response_style = styles[verdict]

        # Truncate long responses
        display_response = actual[:26] + "..." if len(actual) > 26 else actual

        table.add_row(
            question,
            f"[{response_style}]{display_response}[/{response_style}]",
            status
        )

    return table
|
||||
|
||||
|
||||
def create_progress_panel(step, total_steps, checkpoint_num, total_checkpoints):
    """
    Create a panel with ASCII progress bars for steps and checkpoints.

    Args:
        step: Steps completed so far.
        total_steps: Estimated total number of steps (0 yields 0% progress).
        checkpoint_num: Checkpoints completed so far.
        total_checkpoints: Expected number of checkpoints (0 yields 0%).

    Returns:
        A rich Panel. The printed percentages are the raw values and may
        exceed 100% when training outruns the estimate; the bars themselves
        are clamped so they always stay exactly 40 characters wide.
    """
    bar_width = 40

    def ascii_bar(percent):
        # One cell per 2.5%; clamp so values outside 0-100% cannot make the
        # bar wider than bar_width.
        filled = max(0, min(bar_width, int(percent / 2.5)))
        return "[" + "=" * filled + " " * (bar_width - filled) + "]"

    step_progress = (step / total_steps) * 100 if total_steps > 0 else 0
    checkpoint_progress = (checkpoint_num / total_checkpoints) * 100 if total_checkpoints > 0 else 0

    step_bar = ascii_bar(step_progress)
    checkpoint_bar = ascii_bar(checkpoint_progress)

    text = (
        f"[bold]Training Progress:[/bold]\n"
        f"{step_bar} {step_progress:.1f}% ({step}/{total_steps} steps)\n\n"
        f"[bold]Checkpoints:[/bold]\n"
        f"{checkpoint_bar} {checkpoint_progress:.1f}% ({checkpoint_num}/{total_checkpoints} completed)"
    )

    return Panel(text, title="📊 Progress", border_style="blue")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Training with Dashboard
|
||||
# ============================================================================
|
||||
|
||||
def train_with_dashboard(model, optimizer, loss_fn, train_data, test_questions, expected_answers,
                         char_to_idx, idx_to_char, max_time_minutes=10, checkpoint_interval_steps=1500):
    """
    Train with a Rich dashboard showing live progress.

    Shows a welcome panel and waits for ENTER, evaluates the untrained model,
    then trains one randomly sampled conversation per step until the time
    budget is used up. A status line is printed about every 5 seconds and the
    test questions are re-evaluated every checkpoint_interval_steps steps.
    The time budget is measured from before the initial evaluation, so
    evaluations and the short pauses count against it.

    Args:
        model: Object exposing forward(x) and parameters().
        optimizer: Object exposing zero_grad() and step().
        loss_fn: Object exposing forward(logits, targets).
        train_data: Sequence of token-id lists to sample from.
        test_questions: Questions evaluated at every checkpoint.
        expected_answers: Expected answers, aligned with test_questions.
        char_to_idx: Vocabulary used to encode the questions.
        idx_to_char: Inverse vocabulary used to decode responses.
        max_time_minutes: Wall-clock training budget in minutes.
        checkpoint_interval_steps: Steps between checkpoint evaluations.

    Returns:
        (losses, step, accuracy): per-step losses, number of steps run, and
        the final exact-match accuracy in percent (0 without test questions).
    """
    max_time_seconds = max_time_minutes * 60

    console.clear()
    console.print(create_welcome_panel())
    console.print()

    # console.input renders the Rich markup in the prompt; the builtin
    # input() would print the [bold cyan] tags literally.
    console.input("[bold cyan]Press ENTER to start training...[/bold cyan]")
    console.clear()

    # Training setup
    start_time = time.time()
    losses = []
    step = 0
    checkpoint_num = 0

    def evaluate():
        # Ask every test question with the model's current weights
        return [(q, generate_response(model, q, char_to_idx, idx_to_char)) for q in test_questions]

    def recent_mean():
        # Mean of the last 100 losses (the whole list while it is shorter)
        return np.mean(losses[-100:])

    # Calculate expected checkpoints
    estimated_total_steps = int(max_time_seconds * 12)  # ~12 steps/sec
    total_checkpoints = estimated_total_steps // checkpoint_interval_steps

    # Initial evaluation (999.9 is a placeholder loss for the untrained model)
    console.print("\n[bold]📊 CHECKPOINT 0: Initial Model (Untrained)[/bold]\n")
    initial_results = evaluate()
    console.print(create_checkpoint_comparison(0, 0, 999.9, initial_results, expected_answers))
    console.print()

    console.print("[dim]Starting training... Watch the responses improve![/dim]\n")
    time.sleep(2)

    next_checkpoint = checkpoint_interval_steps
    last_print_time = time.time()

    # Training loop
    while True:
        elapsed = time.time() - start_time
        if elapsed >= max_time_seconds:
            break

        # Training step: next-token prediction on one random conversation
        tokens = train_data[np.random.randint(len(train_data))]
        input_seq = tokens[:-1]
        target_seq = tokens[1:]

        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)

        logits = model.forward(x)

        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = y_true.reshape(batch_size * seq_len)
        loss = loss_fn.forward(logits_flat, targets_flat)

        optimizer.zero_grad()
        loss.backward()

        # Clip gradients element-wise, in place
        for param in model.parameters():
            if param.grad is not None:
                np.clip(param.grad, -1.0, 1.0, out=param.grad)

        optimizer.step()

        losses.append(loss.data.item())
        step += 1

        # Print progress every 5 seconds
        if time.time() - last_print_time >= 5.0:
            avg_loss = recent_mean()
            steps_per_sec = step / elapsed
            console.print(
                f"[dim]Step {step:5d} | "
                f"Loss: {avg_loss:.4f} | "
                f"Time: {int(elapsed/60)}m{int(elapsed%60):02d}s | "
                f"Speed: {steps_per_sec:.1f} steps/sec[/dim]"
            )
            last_print_time = time.time()

        # Checkpoint evaluation
        if step >= next_checkpoint:
            checkpoint_num += 1
            avg_loss = recent_mean()

            console.print("\n" + "="*70)
            console.print(f"[bold yellow]⏸️ CHECKPOINT {checkpoint_num}[/bold yellow]")
            console.print(f"[dim]Pausing training to evaluate... (Step {step:,})[/dim]\n")

            # Evaluate
            current_results = evaluate()

            # Show results
            console.print(create_checkpoint_comparison(checkpoint_num, step, avg_loss, current_results, expected_answers))
            console.print()

            # Show progress
            console.print(create_progress_panel(step, estimated_total_steps, checkpoint_num, total_checkpoints))
            console.print()

            console.print("[dim]Continuing training...[/dim]\n")
            next_checkpoint += checkpoint_interval_steps
            time.sleep(1)

    # Final results
    final_elapsed = time.time() - start_time
    if losses:
        final_loss = recent_mean()
        initial_loss = np.mean(losses[:10])
    else:
        # No step completed: report nan instead of averaging an empty list
        final_loss = initial_loss = float("nan")
    improvement = (1 - final_loss / initial_loss) * 100
    final_speed = step / final_elapsed if final_elapsed > 0 else 0.0

    console.print("\n" + "="*70)
    console.print("[bold green]🎉 TRAINING COMPLETE![/bold green]\n")

    # Final evaluation
    final_results = evaluate()
    console.print(create_checkpoint_comparison("FINAL", step, final_loss, final_results, expected_answers))
    console.print()

    # Summary table
    summary = Table(title="Training Summary", box=box.DOUBLE, show_header=True)
    summary.add_column("Metric", style="cyan", width=30)
    summary.add_column("Value", style="green bold", width=30)

    summary.add_row("Total Training Time", f"{final_elapsed/60:.1f} minutes")
    summary.add_row("Total Steps", f"{step:,}")
    summary.add_row("Steps/Second", f"{final_speed:.1f}")
    summary.add_row("Initial Loss", f"{initial_loss:.4f}")
    summary.add_row("Final Loss", f"{final_loss:.4f}")
    summary.add_row("Improvement", f"{improvement:.1f}%")
    summary.add_row("Checkpoints Evaluated", f"{checkpoint_num}")

    console.print(summary)
    console.print()

    # Count perfect responses for milestone card
    correct = sum(1 for (q, actual), expected in zip(final_results, expected_answers)
                  if actual.strip().lower() == expected.strip().lower())
    # Without test questions there is nothing to score
    accuracy = (correct / len(test_questions)) * 100 if test_questions else 0.0

    return losses, step, accuracy
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """
    Run the TinyTalks dashboard demo end to end.

    Builds the dataset and tokenizer, creates a one-layer GPT, trains it with
    the live dashboard and finally prints a milestone card. The success card
    is shown when the final accuracy is at least 50% and the loss improved by
    at least 80%; otherwise a "needs more time" card is shown.
    """
    # Dataset
    conversations = create_tinytalks_dataset()
    char_to_idx, idx_to_char = create_tokenizer(conversations)
    vocab_size = len(idx_to_char)

    # Encode
    max_seq_len = 80
    train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations]

    # Test questions and expected answers (aligned by position)
    test_questions = [
        "Hi",
        "How are you",
        "What is your name",
        "What is the sky",
        "Is grass green",
        "What is 1 plus 1",
        "Are you happy"
    ]

    expected_answers = [
        "Hello! How can I help you?",
        "I am doing well, thanks!",
        "I am TinyBot",
        "The sky is blue",
        "Yes, grass is green",
        "1 plus 1 equals 2",
        "Yes, I am happy"
    ]

    # Model
    config = {
        'vocab_size': vocab_size,
        'embed_dim': 16,
        'num_layers': 1,
        'num_heads': 2,
        'max_seq_len': max_seq_len,
    }

    model = GPT(**config)
    num_params = sum(np.prod(p.shape) for p in model.parameters())

    # Optimizer
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Train with dashboard
    train_time = 15  # 15 minutes for better results
    checkpoint_interval = 2000  # Every ~2.5 minutes

    console.print(Panel.fit(
        f"[bold]Model:[/bold] {num_params:,} parameters (ultra-tiny!)\n"
        f"[bold]Training Time:[/bold] {train_time} minutes\n"
        f"[bold]Checkpoints:[/bold] Every {checkpoint_interval} steps (~2 min)\n"
        f"[bold]Test Questions:[/bold] {len(test_questions)} questions\n\n"
        f"[dim]Watch loss decrease and responses improve![/dim]",
        title="⚙️ Configuration",
        border_style="blue"
    ))

    losses, total_steps, final_accuracy = train_with_dashboard(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        test_questions=test_questions,
        expected_answers=expected_answers,
        char_to_idx=char_to_idx,
        idx_to_char=idx_to_char,
        max_time_minutes=train_time,
        checkpoint_interval_steps=checkpoint_interval
    )

    # Calculate metrics for milestone card: last 100 losses vs. first 10
    loss_improvement = (1 - np.mean(losses[-100:]) / np.mean(losses[:10])) * 100

    # Milestone completion card
    console.print()
    if final_accuracy >= 50 and loss_improvement >= 80:
        console.print(Panel.fit(
            "[bold green]🎉 Congratulations! You've Built a Working Chatbot![/bold green]\n\n"

            f"Final accuracy: [bold]{final_accuracy:.0f}%[/bold] | "
            f"Loss improved: [bold]{loss_improvement:.1f}%[/bold]\n\n"

            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

            "[bold]💡 What YOU Just Accomplished:[/bold]\n"
            " ✓ Built a TRANSFORMER (2017 Vaswani et al)\n"
            " ✓ Trained with attention mechanism from scratch\n"
            " ✓ Watched AI learn language patterns in real-time\n"
            " ✓ Demonstrated gradient descent on complex architectures\n"
            f" ✓ Trained {total_steps:,} steps in {train_time} minutes!\n\n"

            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

            "[bold]🎓 Why This Matters:[/bold]\n"
            " This is the SAME architecture behind ChatGPT, GPT-4, and BERT.\n"
            " You just witnessed the magic of:\n"
            " • Self-attention (learning relationships between words)\n"
            " • Position encoding (understanding word order)\n"
            " • Autoregressive generation (predicting next token)\n\n"

            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

            "[bold]📌 The Key Insight:[/bold]\n"
            " You saw responses evolve from gibberish to coherent:\n"
            " Checkpoint 0: Random noise\n"
            " Checkpoint 1: Recognizable words\n"
            " Checkpoint 2: Partial sentences\n"
            " Final: Perfect responses!\n"
            " \n"
            " [yellow]Scale it up:[/yellow] Same process, more data, more params →\n"
            # 175B is the published parameter count of GPT-3, not GPT-4
            " You get GPT-3 (175B params, trained for weeks)!\n\n"

            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

            "[bold]🚀 What You Can Do Now:[/bold]\n"
            "• Experiment with different architectures (layers, heads)\n"
            "• Try longer training (15-20 minutes for better results)\n"
            "• Add more conversation patterns to the dataset\n"
            "• Scale up the model (more parameters = better learning)\n\n"

            "[bold cyan]You've mastered the foundation of modern AI! 🌟[/bold cyan]",

            title="🌟 2017 Transformer Complete - Milestone 05",
            border_style="green",
            box=box.DOUBLE
        ))
    else:
        console.print(Panel.fit(
            "[bold yellow]⚠️ Training Complete - Needs More Time[/bold yellow]\n\n"
            f"Current accuracy: {final_accuracy:.0f}% | Loss improved: {loss_improvement:.1f}%\n\n"
            "Your transformer is learning but needs more training time.\n\n"
            "[bold]What to try:[/bold]\n"
            "• Train for 15-20 minutes instead of 10\n"
            "• Use a slightly bigger model (2 layers, 24 dims)\n"
            "• Add more data repetition for reinforcement\n\n"
            "[dim]The attention mechanism is working - it just needs more steps to converge!\n"
            "Even partial success shows the transformer learned patterns.[/dim]",
            title="🔄 Learning in Progress",
            border_style="yellow",
            box=box.DOUBLE
        ))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,208 +0,0 @@
|
||||
"""
|
||||
TinyTalks Dataset - Small Conversational Dataset for Transformer Training
|
||||
==========================================================================
|
||||
|
||||
A carefully curated micro-dataset for training a chatbot in 10-15 minutes.
|
||||
|
||||
Design Principles:
|
||||
- Simple Q&A format
|
||||
- Short responses (< 50 characters)
|
||||
- Common patterns repeated for learning
|
||||
- Character-level tokenization friendly
|
||||
- Covers basic conversational topics
|
||||
"""
|
||||
|
||||
def create_tinytalks_dataset():
    """
    Build the TinyTalks micro-corpus of (question, answer) string pairs.

    Every topic contributes one full pass over its pairs, followed by
    shorter "replay" passes that repeat a subset of them, so the most
    common patterns appear up to three times in the result.

    Topics, in output order:
        1. Greetings
        2. Simple facts
        3. Yes/No questions
        4. Weather
        5. Feelings
        6. Simple math
        7. Colors
        8. Identity
        9. Capabilities

    Returns:
        list of (question, answer) tuples: 71 entries, 37 of them distinct.
    """
    # Each entry is (pairs, replays). All pairs are emitted once in order,
    # then once more per replay tuple, which lists the indices to repeat.
    topics = [
        # === GREETINGS ===
        (
            [
                ("Hi", "Hello! How can I help you?"),
                ("Hello", "Hi there! How are you?"),
                ("Good morning", "Good morning! Nice day!"),
                ("How are you", "I am doing well, thanks!"),
                ("Bye", "Goodbye! Have a great day!"),
            ],
            [(0, 1, 3, 4), (0, 1, 4)],
        ),
        # === SIMPLE FACTS ===
        (
            [
                ("What is the sky", "The sky is blue"),
                ("What is grass", "Grass is green"),
                ("What is the sun", "The sun is bright and yellow"),
                ("What is water", "Water is wet and clear"),
                ("What is fire", "Fire is hot and red"),
            ],
            [(0, 1, 3), (0, 1)],
        ),
        # === YES/NO QUESTIONS ===
        (
            [
                ("Is the sky blue", "Yes, the sky is blue"),
                ("Is grass green", "Yes, grass is green"),
                ("Is the sun cold", "No, the sun is hot"),
                ("Is water dry", "No, water is wet"),
                ("Is fire safe", "No, fire is dangerous"),
            ],
            [(0, 1, 2), (0, 1)],
        ),
        # === WEATHER ===
        (
            [
                ("Is it sunny", "Yes, it is sunny today"),
                ("Is it raining", "No, it is not raining"),
                ("Is it cold", "No, it is warm today"),
                ("What is the weather", "The weather is nice"),
            ],
            [(0, 3), (0,)],
        ),
        # === FEELINGS ===
        (
            [
                ("Are you happy", "Yes, I am happy"),
                ("Are you sad", "No, I am not sad"),
                ("Are you tired", "No, I feel good"),
                ("Do you like learning", "Yes, I love learning"),
            ],
            [(0, 3), (0,)],
        ),
        # === SIMPLE MATH ===
        (
            [
                ("What is 1 plus 1", "1 plus 1 equals 2"),
                ("What is 2 plus 2", "2 plus 2 equals 4"),
                ("What is 3 plus 3", "3 plus 3 equals 6"),
                ("What is 5 plus 5", "5 plus 5 equals 10"),
            ],
            [(0, 1), (0,)],
        ),
        # === COLORS ===
        (
            [
                ("What color is the sky", "The sky is blue"),
                ("What color is grass", "Grass is green"),
                ("What color is the sun", "The sun is yellow"),
                ("What color is snow", "Snow is white"),
            ],
            [(0, 1), (0,)],
        ),
        # === IDENTITY ===
        (
            [
                ("What is your name", "I am TinyBot"),
                ("Who are you", "I am TinyBot, your helper"),
                ("What do you do", "I help answer questions"),
            ],
            [(0, 1), (0,)],
        ),
        # === CAPABILITIES ===
        (
            [
                ("Can you help me", "Yes, I can help you"),
                ("Can you talk", "Yes, I can talk with you"),
                ("Do you understand", "Yes, I understand you"),
            ],
            [(0, 1)],
        ),
    ]

    conversations = []
    for pairs, replays in topics:
        conversations.extend(pairs)
        for picks in replays:
            conversations.extend(pairs[i] for i in picks)

    return conversations
|
||||
|
||||
|
||||
def get_dataset_stats():
    """
    Summarise the TinyTalks dataset.

    Returns:
        dict with the total and distinct example counts, their ratio
        ('repetition_factor'), the combined character count of all
        questions and answers, the mean question and answer lengths in
        characters, and a fixed list of category labels.
    """
    pairs = create_tinytalks_dataset()
    n_total = len(pairs)
    n_unique = len(set(pairs))

    # Accumulate question and answer lengths in a single pass.
    question_chars = 0
    answer_chars = 0
    for question, answer in pairs:
        question_chars += len(question)
        answer_chars += len(answer)

    category_labels = [
        'Greetings (5x repeat)',
        'Simple Facts (3x repeat)',
        'Yes/No Questions (3x repeat)',
        'Weather (3x repeat)',
        'Feelings (3x repeat)',
        'Simple Math (3x repeat)',
        'Colors (3x repeat)',
        'Identity (3x repeat)',
        'Capabilities (2x repeat)'
    ]

    return {
        'total_examples': n_total,
        'unique_examples': n_unique,
        'repetition_factor': n_total / n_unique,
        'total_chars': question_chars + answer_chars,
        'avg_question_len': question_chars / n_total,
        'avg_answer_len': answer_chars / n_total,
        'categories': category_labels,
    }
|
||||
|
||||
|
||||
def print_dataset_info():
    """
    Print a summary of the TinyTalks dataset to stdout.

    Shows the counts and averages from get_dataset_stats(), the category
    labels, and up to 10 sample Q&A pairs.

    The sample is reproducible from run to run: duplicates are removed
    while keeping first-seen order (a plain set() of strings has an order
    that varies with hash randomization), and sampling uses a private
    random.Random(42) so the process-wide random state is left untouched.
    """
    import random

    conversations = create_tinytalks_dataset()
    stats = get_dataset_stats()

    print("=" * 70)
    print("TINYTALKS DATASET")
    print("=" * 70)
    print()
    print(f"Total examples: {stats['total_examples']}")
    print(f"Unique examples: {stats['unique_examples']}")
    print(f"Repetition factor: {stats['repetition_factor']:.1f}x")
    print(f"Average question length: {stats['avg_question_len']:.1f} chars")
    print(f"Average answer length: {stats['avg_answer_len']:.1f} chars")
    print()
    print("Categories:")
    for cat in stats['categories']:
        print(f" • {cat}")
    print()
    print("Sample conversations:")
    print("-" * 70)

    # Show up to 10 random unique examples.
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique = list(dict.fromkeys(conversations))
    rng = random.Random(42)
    samples = rng.sample(unique, min(10, len(unique)))

    for q, a in samples:
        print(f"Q: {q}")
        print(f"A: {a}")
        print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print_dataset_info()
|
||||
|
||||
@@ -1,746 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TinyTalks Q&A Generation (2017) - Transformer Era
|
||||
==================================================
|
||||
|
||||
📚 HISTORICAL CONTEXT:
|
||||
In 2017, Vaswani et al. published "Attention Is All You Need", showing that
|
||||
attention mechanisms alone (no RNNs!) could achieve state-of-the-art results
|
||||
on sequence tasks. This breakthrough launched the era of GPT, BERT, and modern LLMs.
|
||||
|
||||
🎯 WHAT YOU'RE BUILDING:
|
||||
Using YOUR TinyTorch implementations, you'll build a character-level conversational
|
||||
model that learns to answer questions - proving YOUR attention mechanism works!
|
||||
|
||||
TinyTalks is PERFECT for learning:
|
||||
- Small dataset (17.5 KB) = 3-5 minute training!
|
||||
- Clear Q&A format (easy to verify learning)
|
||||
- Progressive difficulty (5 levels)
|
||||
- Instant gratification: Watch your transformer learn to chat!
|
||||
|
||||
✅ REQUIRED MODULES (Run after Module 13):
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||
Module 01 (Tensor) : YOUR data structure with autograd
|
||||
Module 02 (Activations) : YOUR ReLU and GELU activations
|
||||
Module 03 (Layers) : YOUR Linear layers
|
||||
Module 04 (Losses) : YOUR CrossEntropyLoss
|
||||
Module 05 (Autograd) : YOUR automatic differentiation
|
||||
Module 06 (Optimizers) : YOUR Adam optimizer
|
||||
Module 08 (DataLoader) : YOUR data batching
|
||||
Module 10 (Tokenization) : YOUR CharTokenizer for text→numbers
|
||||
Module 11 (Embeddings) : YOUR token & positional embeddings
|
||||
Module 12 (Attention) : YOUR multi-head self-attention
|
||||
Module 13 (Transformers) : YOUR LayerNorm + TransformerBlock + GPT
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||
|
||||
🏗️ ARCHITECTURE (Character-Level Q&A Model):
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Output Predictions │
|
||||
│ Character Probabilities (vocab_size) │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Output Projection │
|
||||
│ Module 03: vectors → vocabulary │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Layer Norm │
|
||||
│ Module 13: Final normalization │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
╔══════════════════════════════════════════════════════════════════════════════╗
|
||||
║ Transformer Block × N (Repeat) ║
|
||||
║ ┌────────────────────────────────────────────────────────────────────────┐ ║
|
||||
║ │ Feed Forward Network │ ║
|
||||
║ │ Module 03: Linear → GELU → Linear │ ║
|
||||
║ └────────────────────────────────────────────────────────────────────────┘ ║
|
||||
║ ▲ ║
|
||||
║ ┌────────────────────────────────────────────────────────────────────────┐ ║
|
||||
║ │ Multi-Head Self-Attention │ ║
|
||||
║ │ Module 12: Query·Key^T·Value across all positions │ ║
|
||||
║ └────────────────────────────────────────────────────────────────────────┘ ║
|
||||
╚══════════════════════════════════════════════════════════════════════════════╝
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Positional Encoding │
|
||||
│ Module 11: Add position information │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Character Embeddings │
|
||||
│ Module 11: chars → embed_dim vectors │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
▲
|
||||
┌──────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Input Characters │
|
||||
│ "Q: What color is the sky? A:" │
|
||||
└──────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
📊 EXPECTED PERFORMANCE:
|
||||
- Dataset: 17.5 KB TinyTalks (301 Q&A pairs, 5 difficulty levels)
|
||||
- Training time: 3-5 minutes (instant gratification!)
|
||||
- Vocabulary: ~68 unique characters (simple English Q&A)
|
||||
- Expected: 70-80% accuracy on Level 1-2 questions after training
|
||||
- Parameters: ~1.2M (perfect size for fast learning on small data)
|
||||
|
||||
💡 WHAT TO WATCH FOR:
|
||||
- Epoch 1-3: Model learns Q&A structure ("A:" follows "Q:")
|
||||
- Epoch 4-7: Starts giving sensible (if incorrect) answers
|
||||
- Epoch 8-12: 50-60% accuracy on simple questions
|
||||
- Epoch 13-20: 70-80% accuracy, proper grammar
|
||||
- Success = "Wow, my transformer actually learned to answer questions!"
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import numpy as np
|
||||
import argparse
|
||||
import time
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich import box
|
||||
|
||||
# Add project root to path
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(project_root)
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
def print_banner():
    """Render the milestone title banner inside a double-bordered panel."""
    rows = [
        "╔══════════════════════════════════════════════════════════════════╗",
        "║ ║",
        "║ 🤖 TinyTalks Q&A Bot Training (2017) ║",
        "║ Transformer Architecture ║",
        "║ ║",
        "║ \"Your first transformer learning to answer questions!\" ║",
        "║ ║",
        "╚══════════════════════════════════════════════════════════════════╝",
    ]
    # The empty first and last items give the text a leading and a trailing newline.
    banner_text = "\n".join(["", *rows, ""])
    panel = Panel(banner_text, border_style="bright_blue", box=box.DOUBLE)
    console.print(panel)
|
||||
|
||||
|
||||
def filter_by_levels(text, levels):
    """
    Filter TinyTalks text down to the requested difficulty levels.

    Per the original generation notes the levels are:
        L1: Greetings (47 pairs)
        L2: Facts (82 pairs)
        L3: Math (45 pairs)
        L4: Reasoning (87 pairs)
        L5: Context (40 pairs)

    The text carries no level markers, so each question is classified with
    a keyword heuristic. The first matching level wins, and anything that
    matches nothing is treated as level 5:
        L1: hello, hi, name, who are you, ...
        L2: color, legs, days, ...
        L3: plus, minus, times, ...
        L4: use, where do, what do, ...

    Keywords must start at a word boundary, so 'hi' no longer matches inside
    "this" or "which" and 'use' no longer matches inside "house". Suffixes
    are still accepted ("colors", "used", "what does").

    Args:
        text: Q&A pairs as "Q: ...\\nA: ..." blocks separated by blank lines.
        levels: iterable of level numbers to keep, or None for everything.

    Returns:
        `text` unchanged when `levels` is None or covers all five levels
        (in any order or container type). Otherwise a new string with only
        the matching, well-formed pairs, re-joined by blank lines.
    """
    import re  # local import: `re` is not among this module's top-level imports

    if levels is None or set(levels) >= {1, 2, 3, 4, 5}:
        return text  # Use full dataset

    # Ordered from level 1 to 4; the order decides which level wins a tie.
    level_keywords = (
        (1, ('hello', 'hi', 'hey', 'goodbye', 'bye', 'name', 'who are you', 'what are you')),
        (2, ('color', 'legs', 'days', 'months', 'sound', 'capital')),
        (3, ('plus', 'minus', 'times', 'divided', 'equals')),
        (4, ('use', 'where do', 'what do', 'happens if', 'need to')),
    )
    level_patterns = [
        (lvl, re.compile(r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + ')'))
        for lvl, keywords in level_keywords
    ]
    wanted = set(levels)

    # Parse Q&A pairs
    pairs = []
    for block in text.strip().split('\n\n'):
        lines = block.strip().split('\n')
        if len(lines) != 2 or not lines[0].startswith('Q:') or not lines[1].startswith('A:'):
            continue  # skip malformed blocks

        # Drop only the two-character marker; strip() removes the optional
        # space after it, so "Q:Hello" keeps its first letter.
        q = lines[0][2:].strip()
        a = lines[1][2:].strip()

        # Classify level (heuristic)
        q_lower = q.lower()
        level = 5  # default
        for lvl, pattern in level_patterns:
            if pattern.search(q_lower):
                level = lvl
                break

        if level in wanted:
            pairs.append(f"Q: {q}\nA: {a}")

    filtered_text = '\n\n'.join(pairs)
    console.print(f"[yellow]📊 Filtered to Level(s) {levels}:[/yellow]")
    console.print(f" Q&A pairs: {len(pairs)}")
    console.print(f" Characters: {len(filtered_text)}")

    return filtered_text
|
||||
|
||||
|
||||
class TinyTalksDataset:
    """
    Character-level dataset for TinyTalks Q&A.

    Produces sliding windows over the encoded text for autoregressive
    language modeling, where the target is the input shifted by one:
    - Input:  "Q: What color is the sky? A: The sk"
    - Target: ": What color is the sky? A: The sky"

    The model learns to predict the next character from the previous
    ones, which is how it picks up the Q&A pattern.
    """

    def __init__(self, text, seq_length=64, levels=None):
        """
        Args:
            text: Full text string (Q&A pairs)
            seq_length: Length of input sequences
            levels: List of difficulty levels to include (1-5), None = all
        """
        from tinytorch.text.tokenization import CharTokenizer

        self.seq_length = seq_length

        # Filter by levels if specified (None or an empty list keeps everything)
        if levels:
            text = filter_by_levels(text, levels)

        # Store the (possibly filtered) text for testing
        self.text = text

        # Build character vocabulary using CharTokenizer
        self.tokenizer = CharTokenizer()
        self.tokenizer.build_vocab([text])

        # Encode entire text
        self.data = self.tokenizer.encode(text)

        console.print(f"[green]✓[/green] Dataset initialized:")
        console.print(f" Total characters: {len(text)}")
        console.print(f" Vocabulary size: {self.tokenizer.vocab_size}")
        console.print(f" Sequence length: {seq_length}")
        console.print(f" Total sequences: {len(self)}")

    def __len__(self):
        """
        Number of full (input, target) windows available.

        Clamped at zero: when the encoded text is shorter than seq_length
        the raw difference is negative, and len() rejects negative values.
        """
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """
        Get one training example.

        Args:
            idx: Window start position; negative values count from the end.

        Returns:
            input_seq: Characters [idx : idx+seq_length]
            target_seq: Characters [idx+1 : idx+seq_length+1] (shifted by 1)

        Raises:
            IndexError: if idx is outside [-len(self), len(self)). Without
                this, plain iteration over the dataset would never stop,
                because slicing past the end just returns empty sequences.
        """
        count = len(self)
        if idx < 0:
            idx += count
        if not 0 <= idx < count:
            raise IndexError("TinyTalksDataset index out of range")

        input_seq = self.data[idx:idx + self.seq_length]
        target_seq = self.data[idx + 1:idx + self.seq_length + 1]
        return input_seq, target_seq

    def decode(self, indices):
        """Decode token indices back to text"""
        return self.tokenizer.decode(indices)
|
||||
|
||||
|
||||
class TinyGPT:
    """
    Character-level GPT model for TinyTalks Q&A.

    A simplified GPT architecture:
        1. Token embeddings (convert characters to vectors)
        2. Positional encodings (add position information)
        3. N transformer blocks (self-attention + feed-forward)
        4. Final layer normalization
        5. Output projection (vectors back to character logits)

    Built entirely from TinyTorch modules.
    """

    def __init__(self, vocab_size, embed_dim=128, num_layers=4, num_heads=4,
                 max_seq_len=64, dropout=0.1):
        """
        Args:
            vocab_size: Number of unique characters
            embed_dim: Dimension of embeddings and hidden states
            num_layers: Number of transformer blocks
            num_heads: Number of attention heads per block
            max_seq_len: Maximum sequence length
            dropout: Dropout probability passed to each transformer block
        """
        from tinytorch.text.embeddings import Embedding, PositionalEncoding
        from tinytorch.models.transformer import LayerNorm, TransformerBlock
        from tinytorch.core.layers import Linear

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # 1. Token embeddings: char_id → embed_dim vector
        self.token_embedding = Embedding(vocab_size, embed_dim)

        # 2. Positional encoding: add position information
        self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)

        # 3. Transformer blocks (stacked)
        self.blocks = [
            TransformerBlock(
                embed_dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=4,  # FFN hidden_dim = 4 * embed_dim
                dropout_prob=dropout,
            )
            for _ in range(num_layers)
        ]

        # 4. Final layer normalization
        self.ln_f = LayerNorm(embed_dim)

        # 5. Output projection: embed_dim → vocab_size
        self.output_proj = Linear(embed_dim, vocab_size)

        console.print(f"[green]✓[/green] TinyGPT model initialized:")
        console.print(f" Vocabulary: {vocab_size}")
        console.print(f" Embedding dim: {embed_dim}")
        console.print(f" Layers: {num_layers}")
        console.print(f" Heads: {num_heads}")
        console.print(f" Max sequence: {max_seq_len}")

        # Count parameters
        total_params = self.count_parameters()
        console.print(f" [bold]Total parameters: {total_params:,}[/bold]")

    def forward(self, x):
        """
        Forward pass through the model.

        Args:
            x: Input tensor of shape (batch, seq_len) with token indices

        Returns:
            logits: Output tensor of shape (batch, seq_len, vocab_size)
        """
        # 1. Token embeddings: (batch, seq_len) → (batch, seq_len, embed_dim)
        x = self.token_embedding.forward(x)

        # 2. Add positional encoding
        x = self.pos_encoding.forward(x)

        # 3. Pass through transformer blocks
        for block in self.blocks:
            x = block.forward(x)

        # 4. Final layer norm
        x = self.ln_f.forward(x)

        # 5. Project to vocabulary: (batch, seq_len, embed_dim) → (batch, seq_len, vocab_size)
        return self.output_proj.forward(x)

    def parameters(self):
        """
        Collect the parameters of every sub-module into one flat list.

        Side effect: sets requires_grad = True on each returned parameter.
        """
        params = []

        # Token embeddings
        params.extend(self.token_embedding.parameters())

        # Positional encoding (whatever parameters it exposes)
        params.extend(self.pos_encoding.parameters())

        # Transformer blocks
        for block in self.blocks:
            params.extend(block.parameters())

        # Final layer norm
        params.extend(self.ln_f.parameters())

        # Output projection
        params.extend(self.output_proj.parameters())

        # Ensure all require gradients
        for param in params:
            param.requires_grad = True

        return params

    def count_parameters(self):
        """Count total trainable parameters (sum of each parameter's data.size)."""
        return sum(param.data.size for param in self.parameters())

    @staticmethod
    def _sample_next_token(last_logits, temperature):
        """
        Pick the next token id from the logits of the last position.

        Args:
            last_logits: 1-D array-like of unnormalised scores, one per token.
            temperature: Sampling temperature. Values <= 0 select the
                highest-scoring token deterministically, instead of
                dividing by zero and producing NaN probabilities.

        Returns:
            The chosen token index as a plain Python int.
        """
        scores = np.asarray(last_logits, dtype=np.float64)
        if temperature <= 0:
            return int(np.argmax(scores))

        scaled = scores / temperature
        # Subtract the max before exponentiating for numerical stability.
        exp_scores = np.exp(scaled - np.max(scaled))
        probs = exp_scores / np.sum(exp_scores)
        return int(np.random.choice(len(probs), p=probs))

    def generate(self, tokenizer, prompt="Q:", max_new_tokens=100, temperature=1.0):
        """
        Generate text autoregressively.

        Args:
            tokenizer: CharTokenizer for encoding/decoding
            prompt: Starting text
            max_new_tokens: Upper bound on the number of characters to generate
            temperature: Sampling temperature (higher = more random);
                values <= 0 switch to greedy decoding

        Returns:
            The prompt followed by the generated characters, decoded to text.
            If generation stopped early, the text ends with "\\n\\nQ".
        """
        from tinytorch.core.tensor import Tensor

        # Encode prompt; copy into a list so appending neither fails on an
        # array result nor mutates an object owned by the tokenizer.
        indices = list(tokenizer.encode(prompt))

        # Generate tokens one at a time
        for _ in range(max_new_tokens):
            # Get last max_seq_len tokens (context window)
            context = indices[-self.max_seq_len:]

            # Prepare input: (1, seq_len)
            x_input = Tensor(np.array([context]))

            # Forward pass
            logits = self.forward(x_input)

            # Sample from the logits of the last position: (vocab_size,)
            next_idx = self._sample_next_token(logits.data[0, -1, :], temperature)

            # Append to sequence
            indices.append(next_idx)

            # Stop once the model starts a new question (blank line + "Q")
            if len(indices) > 3 and tokenizer.decode(indices[-3:]) == "\n\nQ":
                break

        return tokenizer.decode(indices)
|
||||
|
||||
|
||||
def test_model_predictions(model, dataset, test_prompts=None):
    """
    Generate an answer for each prompt and print it.

    Args:
        model: Object exposing generate(tokenizer, prompt=..., max_new_tokens=..., temperature=...)
        dataset: Object whose `tokenizer` attribute is passed to generate()
        test_prompts: Questions to ask; defaults to three greeting prompts

    A failure on one prompt is printed (message truncated to 50 characters)
    and does not stop the remaining prompts from being tried.
    """
    prompts = ["Q: Hello!", "Q: What is your name?", "Q: Hi!"] if test_prompts is None else test_prompts
    marker = "\nA:"

    console.print("\n[bold yellow]🧪 Testing Live Predictions:[/bold yellow]")
    for prompt in prompts:
        try:
            full_prompt = prompt + marker
            response = model.generate(
                dataset.tokenizer,
                prompt=full_prompt,
                max_new_tokens=30,
                temperature=0.5,
            )

            # Keep only the first line after the first answer marker; if the
            # marker is missing, fall back to everything after the prompt.
            _, found, tail = response.partition(marker)
            if found:
                answer = tail.partition("\n")[0].strip()
            else:
                answer = response[len(full_prompt):].strip()

            console.print(f" {prompt}")
            console.print(f" → [cyan]{answer}[/cyan]")
        except Exception as e:
            console.print(f" {prompt} → [red]Error: {str(e)[:50]}[/red]")
|
||||
|
||||
|
||||
def train_tinytalks_gpt(model, dataset, optimizer, criterion, epochs=20, batch_size=32,
                        log_interval=50, test_prompts=None):
    """
    Train the TinyGPT model on TinyTalks dataset.

    Training loop:
        1. Sample random batch of sequences (with replacement)
        2. Forward pass: predict next character for each position
        3. Compute cross-entropy loss
        4. Backward pass: compute gradients
        5. Update parameters with the optimizer
        6. Periodically test on sample questions to show learning

    Each epoch runs at most 500 batches and always at least one, so a
    dataset with fewer sequences than `batch_size` still trains instead of
    failing with a division by zero in the epoch summary.

    Args:
        model: TinyGPT instance
        dataset: TinyTalksDataset instance
        optimizer: Adam optimizer
        criterion: CrossEntropyLoss
        epochs: Number of training epochs
        batch_size: Number of sequences per batch
        log_interval: Print loss every N batches
        test_prompts: Optional list of questions to test during training

    Raises:
        ValueError: if the dataset contains no sequences to sample from.
    """
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.autograd import enable_autograd

    num_sequences = len(dataset)
    if num_sequences <= 0:
        raise ValueError(
            "Dataset has no training sequences; the text must be longer than the sequence length"
        )

    # Enable autograd
    enable_autograd()

    console.print("\n[bold cyan]Starting Training...[/bold cyan]")
    console.print(f" Epochs: {epochs}")
    console.print(f" Batch size: {batch_size}")
    console.print(f" Dataset size: {len(dataset)} sequences")

    # Batches per epoch: capped at 500, never fewer than one.
    batches_per_epoch = max(1, min(500, num_sequences // batch_size))

    start_time = time.time()

    for epoch in range(epochs):
        epoch_start = time.time()
        epoch_loss = 0.0
        num_batches = 0

        for batch_idx in range(batches_per_epoch):
            # Sample random batch
            batch_indices = np.random.randint(0, num_sequences, size=batch_size)

            batch_inputs = []
            batch_targets = []
            for idx in batch_indices:
                input_seq, target_seq = dataset[int(idx)]
                batch_inputs.append(input_seq)
                batch_targets.append(target_seq)

            # Convert to tensors: (batch, seq_len)
            batch_input = Tensor(np.array(batch_inputs))
            batch_target = Tensor(np.array(batch_targets))

            # Forward pass
            logits = model.forward(batch_input)

            # Reshape for loss computation: (batch, seq, vocab) → (batch*seq, vocab)
            # IMPORTANT: Use Tensor.reshape() to preserve computation graph!
            batch_size_actual, seq_length, vocab_size = logits.shape
            logits_2d = logits.reshape(batch_size_actual * seq_length, vocab_size)
            targets_1d = batch_target.reshape(-1)

            # Compute loss
            loss = criterion.forward(logits_2d, targets_1d)

            # Backward pass
            loss.backward()

            # Update parameters
            optimizer.step()

            # Zero gradients so the next batch starts clean
            optimizer.zero_grad()

            # Track loss
            batch_loss = float(loss.data)
            epoch_loss += batch_loss
            num_batches += 1

            # Log progress on the first batch and every log_interval batches
            if (batch_idx + 1) % log_interval == 0 or batch_idx == 0:
                avg_loss = epoch_loss / num_batches
                elapsed = time.time() - start_time
                console.print(
                    f" Epoch {epoch+1}/{epochs} | "
                    f"Batch {batch_idx+1}/{batches_per_epoch} | "
                    f"Loss: {batch_loss:.4f} | "
                    f"Avg: {avg_loss:.4f} | "
                    f"Time: {elapsed:.1f}s"
                )

        # Epoch summary
        avg_epoch_loss = epoch_loss / num_batches
        epoch_time = time.time() - epoch_start
        console.print(
            f"[green]✓[/green] Epoch {epoch+1}/{epochs} complete | "
            f"Avg Loss: {avg_epoch_loss:.4f} | "
            f"Time: {epoch_time:.1f}s"
        )

        # Test after the first epoch, every 5th epoch, and the last epoch
        if (epoch + 1) % 5 == 0 or epoch == 0 or epoch == epochs - 1:
            test_model_predictions(model, dataset, test_prompts)

    total_time = time.time() - start_time
    console.print(f"\n[bold green]✓ Training complete![/bold green]")
    console.print(f" Total time: {total_time/60:.2f} minutes")
|
||||
|
||||
|
||||
def demo_questions(model, tokenizer):
    """
    Ask the trained model a fixed set of questions and print its answers.

    The questions cover several difficulty levels. For each one, the first
    line after the answer marker is printed; if the response contains no
    marker, the whole response is printed dimmed instead.
    """
    rule = "=" * 70
    marker = "\nA:"

    console.print("\n" + rule)
    console.print("[bold cyan]🤖 TinyBot Demo: Ask Me Questions![/bold cyan]")
    console.print(rule)

    # Test questions from different levels
    test_questions = (
        "Q: Hello!",
        "Q: What is your name?",
        "Q: What color is the sky?",
        "Q: How many legs does a dog have?",
        "Q: What is 2 plus 3?",
        "Q: What do you use a pen for?",
    )

    for question in test_questions:
        console.print(f"\n[yellow]{question}[/yellow]")

        # Generate answer
        response = model.generate(
            tokenizer,
            prompt=question + marker,
            max_new_tokens=50,
            temperature=0.8,
        )

        # Extract just the answer part
        _, found, tail = response.partition(marker)
        if not found:
            console.print(f"[dim]{response}[/dim]")
            continue

        answer = tail.partition("\n")[0].strip()
        console.print(f"[green]A: {answer}[/green]")

    console.print("\n" + rule)
|
||||
|
||||
|
||||
def main():
    """Run the TinyTalks Q&A training pipeline from the command line.

    Steps, in order: parse CLI options, verify that the TinyTorch modules can
    be imported, load the TinyTalks training split, build the dataset/model/
    optimizer/loss, print the configuration table, train, run the demo
    questions, and print a closing summary.

    Returns None. The function returns early (after printing guidance) when a
    TinyTorch module cannot be imported or the dataset file is missing. An
    invalid ``--levels`` value ends the program through ``parser.error``
    (usage message, exit status 2) instead of an uncaught ``ValueError``.
    """
    parser = argparse.ArgumentParser(description='Train TinyGPT on TinyTalks Q&A')
    parser.add_argument('--epochs', type=int, default=30, help='Number of training epochs (default: 30)')
    parser.add_argument('--batch-size', type=int, default=16, help='Batch size (default: 16)')
    parser.add_argument('--lr', type=float, default=0.001, help='Learning rate (default: 0.001)')
    parser.add_argument('--seq-length', type=int, default=64, help='Sequence length (default: 64)')
    parser.add_argument('--embed-dim', type=int, default=96, help='Embedding dimension (default: 96, ~500K params)')
    parser.add_argument('--num-layers', type=int, default=4, help='Number of transformer layers (default: 4)')
    parser.add_argument('--num-heads', type=int, default=4, help='Number of attention heads (default: 4)')
    parser.add_argument('--levels', type=str, default=None, help='Difficulty levels to train on (e.g. "1" or "1,2"). Default: all levels')
    args = parser.parse_args()

    # Parse levels argument. A value such as "1,x" or "1," used to escape as a
    # raw ValueError traceback; report it as a normal argparse usage error.
    levels = None
    if args.levels:
        try:
            levels = [int(part.strip()) for part in args.levels.split(',')]
        except ValueError:
            parser.error(
                f'--levels expects comma-separated integers such as "1" or "1,2" (got {args.levels!r})'
            )

    print_banner()

    # Import TinyTorch components. Tensor and CharTokenizer are not used
    # directly below; importing them here only checks that they are available.
    console.print("\n[bold]Importing TinyTorch components...[/bold]")
    try:
        from tinytorch.core.tensor import Tensor
        from tinytorch.core.optimizers import Adam
        from tinytorch.core.losses import CrossEntropyLoss
        from tinytorch.text.tokenization import CharTokenizer
        console.print("[green]✓[/green] All modules imported successfully!")
    except ImportError as e:
        console.print(f"[red]✗[/red] Import error: {e}")
        console.print("\nMake sure you have completed all required modules:")
        console.print(" - Module 01 (Tensor)")
        console.print(" - Module 02 (Activations)")
        console.print(" - Module 03 (Layers)")
        console.print(" - Module 04 (Losses)")
        console.print(" - Module 05 (Autograd)")
        console.print(" - Module 06 (Optimizers)")
        console.print(" - Module 10 (Tokenization)")
        console.print(" - Module 11 (Embeddings)")
        console.print(" - Module 12 (Attention)")
        console.print(" - Module 13 (Transformers)")
        return

    # Load TinyTalks dataset
    console.print("\n[bold]Loading TinyTalks dataset...[/bold]")
    dataset_path = os.path.join(project_root, "datasets", "tinytalks", "splits", "train.txt")

    if not os.path.exists(dataset_path):
        console.print(f"[red]✗[/red] Dataset not found: {dataset_path}")
        console.print("\nPlease generate the dataset first:")
        console.print(" python datasets/tinytalks/scripts/generate_tinytalks.py")
        return

    with open(dataset_path, 'r', encoding='utf-8') as f:
        text = f.read()

    console.print(f"[green]✓[/green] Loaded dataset from: {os.path.basename(dataset_path)}")
    console.print(f" File size: {len(text)} characters")

    # Create dataset with level filtering
    dataset = TinyTalksDataset(text, seq_length=args.seq_length, levels=levels)

    # Set test prompts based on levels: the lowest selected level wins,
    # and "all levels" gets a mixed set.
    if levels and 1 in levels:
        test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: Hi!"]
    elif levels and 2 in levels:
        test_prompts = ["Q: What color is the sky?", "Q: How many legs does a dog have?"]
    elif levels and 3 in levels:
        test_prompts = ["Q: What is 2 plus 3?", "Q: What is 5 minus 2?"]
    else:
        test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: What color is the sky?"]

    # Initialize model
    console.print("\n[bold]Initializing TinyGPT model...[/bold]")
    model = TinyGPT(
        vocab_size=dataset.tokenizer.vocab_size,
        embed_dim=args.embed_dim,
        num_layers=args.num_layers,
        num_heads=args.num_heads,
        max_seq_len=args.seq_length,
        dropout=0.1
    )

    # Initialize optimizer and loss
    console.print("\n[bold]Initializing training components...[/bold]")
    optimizer = Adam(model.parameters(), lr=args.lr)
    criterion = CrossEntropyLoss()
    console.print(f"[green]✓[/green] Optimizer: Adam (lr={args.lr})")
    console.print("[green]✓[/green] Loss: CrossEntropyLoss")

    # Print configuration
    table = Table(title="Training Configuration", box=box.ROUNDED)
    table.add_column("Parameter", style="cyan")
    table.add_column("Value", style="green")

    dataset_desc = f"TinyTalks Level(s) {levels}" if levels else "TinyTalks (All Levels)"
    table.add_row("Dataset", dataset_desc)
    table.add_row("Vocabulary Size", str(dataset.tokenizer.vocab_size))
    table.add_row("Model Parameters", f"{model.count_parameters():,}")
    table.add_row("Epochs", str(args.epochs))
    table.add_row("Batch Size", str(args.batch_size))
    table.add_row("Learning Rate", str(args.lr))
    table.add_row("Sequence Length", str(args.seq_length))
    table.add_row("Embedding Dim", str(args.embed_dim))
    table.add_row("Layers", str(args.num_layers))
    table.add_row("Attention Heads", str(args.num_heads))
    table.add_row("Expected Time", "3-5 minutes")

    console.print(table)

    # Train model
    train_tinytalks_gpt(
        model=model,
        dataset=dataset,
        optimizer=optimizer,
        criterion=criterion,
        epochs=args.epochs,
        batch_size=args.batch_size,
        log_interval=50,
        test_prompts=test_prompts
    )

    # Demo Q&A
    demo_questions(model, dataset.tokenizer)

    # Success message
    console.print("\n[bold green]🎉 Congratulations![/bold green]")
    console.print("You've successfully trained a transformer to answer questions!")
    console.print("\nYou used:")
    console.print(" ✓ YOUR Tensor implementation (Module 01)")
    console.print(" ✓ YOUR Activations (Module 02)")
    console.print(" ✓ YOUR Linear layers (Module 03)")
    console.print(" ✓ YOUR CrossEntropyLoss (Module 04)")
    console.print(" ✓ YOUR Autograd system (Module 05)")
    console.print(" ✓ YOUR Adam optimizer (Module 06)")
    console.print(" ✓ YOUR CharTokenizer (Module 10)")
    console.print(" ✓ YOUR Embeddings (Module 11)")
    console.print(" ✓ YOUR Multi-Head Attention (Module 12)")
    console.print(" ✓ YOUR Transformer blocks (Module 13)")
    console.print("\n[bold]This is the foundation of ChatGPT, built by YOU from scratch![/bold]")


if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -1,427 +0,0 @@
|
||||
"""
|
||||
TinyTalks Interactive Learning Dashboard
|
||||
=========================================
|
||||
|
||||
Watch a chatbot learn in real-time!
|
||||
|
||||
Students can see:
|
||||
- Loss decreasing over time
|
||||
- Responses improving from gibberish to coherent
|
||||
- Learning progress at multiple checkpoints
|
||||
- Interactive control (pause/continue)
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.models.transformer import GPT
|
||||
from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
|
||||
|
||||
enable_autograd()
|
||||
|
||||
try:
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.live import Live
|
||||
from rich.layout import Layout
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
|
||||
RICH_AVAILABLE = True
|
||||
except ImportError:
|
||||
RICH_AVAILABLE = False
|
||||
print("Note: Install 'rich' for better visualization: pip install rich")
|
||||
|
||||
# ============================================================================
|
||||
# Tokenization (copied from tinytalks_chatbot.py)
|
||||
# ============================================================================
|
||||
|
||||
def create_tokenizer(conversations):
    """Build a character-level vocabulary from (question, answer) pairs.

    Ids 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<SEP>`` and ``<EOS>``;
    every distinct character found in the conversations follows in sorted
    order starting at id 4.

    Returns:
        (char_to_idx, idx_to_char): the forward and reverse lookup dicts.
    """
    reserved = ['<PAD>', '<SOS>', '<SEP>', '<EOS>']

    # Join with spaces so the space character is part of the vocabulary
    # whenever there is at least one conversation.
    corpus = ' '.join(question + ' ' + answer for question, answer in conversations)
    vocabulary = reserved + sorted(set(corpus))

    char_to_idx = {symbol: position for position, symbol in enumerate(vocabulary)}
    idx_to_char = dict(enumerate(vocabulary))
    return char_to_idx, idx_to_char
|
||||
|
||||
|
||||
def encode_conversation(question, answer, char_to_idx, max_len=80):
    """Turn one Q&A pair into a fixed-length list of token ids.

    Layout: ``<SOS> question <SEP> answer <EOS>`` followed by ``<PAD>`` up to
    ``max_len``. Characters missing from ``char_to_idx`` are encoded as 0.
    Sequences that are already ``max_len`` or longer are cut to ``max_len``
    (which can drop the trailing ``<EOS>``).
    """
    lookup = char_to_idx.get

    encoded = [char_to_idx['<SOS>']]
    encoded.extend(lookup(ch, 0) for ch in question)
    encoded.append(char_to_idx['<SEP>'])
    encoded.extend(lookup(ch, 0) for ch in answer)
    encoded.append(char_to_idx['<EOS>'])

    shortfall = max_len - len(encoded)
    if shortfall > 0:
        return encoded + [char_to_idx['<PAD>']] * shortfall
    return encoded[:max_len]
|
||||
|
||||
|
||||
def decode_tokens(tokens, idx_to_char):
    """Convert token ids back to text.

    Ids 0, 1 and 2 (PAD, SOS, SEP) are dropped, id 3 (EOS) ends decoding, and
    any id missing from ``idx_to_char`` is rendered as ``'?'``.
    """
    silent_ids = (0, 1, 2)  # PAD, SOS, SEP
    eos_id = 3

    pieces = []
    for token in tokens:
        if token == eos_id:
            break
        if token in silent_ids:
            continue
        pieces.append(idx_to_char.get(token, '?'))
    return ''.join(pieces)
|
||||
|
||||
|
||||
def generate_response(model, question, char_to_idx, idx_to_char, max_len=50, max_seq_len=80):
    """Greedily generate a response to ``question``.

    The prompt is ``<SOS> question <SEP>``. On each step the prompt plus the
    tokens generated so far are padded/truncated to ``max_seq_len``, run
    through ``model.forward``, and the arg-max token at the last real position
    is appended. Generation stops on ``<EOS>`` or ``<PAD>``, after ``max_len``
    tokens, or when the context window is full.

    Args:
        model: object exposing ``forward(Tensor)``; the result must expose
            ``.shape`` and ``.data`` indexable as ``[batch, position, vocab]``.
        question: prompt text; characters missing from ``char_to_idx`` are
            encoded as 0.
        char_to_idx: vocabulary containing the four special tokens.
        idx_to_char: reverse vocabulary used to decode the result.
        max_len: maximum number of tokens to generate.
        max_seq_len: length the model input is padded/truncated to. This used
            to be a hard-coded 80; the default keeps that behaviour.

    Returns:
        The decoded response string (special tokens removed).
    """
    pad_id = char_to_idx['<PAD>']
    eos_id = char_to_idx['<EOS>']

    prompt = [char_to_idx['<SOS>']]
    prompt.extend(char_to_idx.get(c, 0) for c in question)
    prompt.append(char_to_idx['<SEP>'])

    generated_tokens = []
    for _ in range(max_len):
        # Index of the last real token; its logits predict the next token.
        next_pos = len(prompt) + len(generated_tokens) - 1
        if next_pos >= max_seq_len:
            # That position would be truncated away below, so skip the
            # forward pass (assumes the model returns one position per input
            # token -- the shape check further down still guards otherwise).
            break

        context = prompt + generated_tokens
        context = (context + [pad_id] * (max_seq_len - len(context)))[:max_seq_len]

        x = Tensor(np.array([context], dtype=np.int32), requires_grad=False)
        logits = model.forward(x)

        if next_pos >= logits.shape[1]:
            break

        next_token = int(np.argmax(logits.data[0, next_pos, :]))
        if next_token == eos_id or next_token == pad_id:
            break
        generated_tokens.append(next_token)

    return decode_tokens(generated_tokens, idx_to_char)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Interactive Training with Checkpoints
|
||||
# ============================================================================
|
||||
|
||||
def evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char):
    """Generate a response for every test question.

    Returns a list of ``(question, response)`` tuples in the same order as
    ``test_questions``.
    """
    return [
        (prompt, generate_response(model, prompt, char_to_idx, idx_to_char))
        for prompt in test_questions
    ]
|
||||
|
||||
|
||||
def show_checkpoint_panel(checkpoint_num, step, loss, results, prev_results=None):
    """Print the results of one checkpoint evaluation.

    With ``rich`` available, a centred header and a Question/Response table
    are printed; when ``prev_results`` is given, a third column shows 📈 if
    the response is longer than the previous one at the same index and 📉
    otherwise. Without ``rich`` a plain-text Q/A listing is printed and
    ``prev_results`` is ignored.
    """
    rule = "=" * 70

    if not RICH_AVAILABLE:
        # Fallback to simple print
        print()
        print(rule)
        print(f"CHECKPOINT {checkpoint_num} - Step {step:,} | Loss: {loss:.4f}")
        print(rule)
        print()
        for asked, answered in results:
            print(f"Q: {asked}")
            print(f"A: {answered}")
            print()
        return

    console = Console()

    # Header
    console.print()
    console.print(rule, style="bold cyan")
    console.print(
        f"CHECKPOINT {checkpoint_num} - Step {step:,} | Loss: {loss:.4f}",
        style="bold yellow",
        justify="center",
    )
    console.print(rule, style="bold cyan")
    console.print()

    # Responses table; the trend column exists only when there is a baseline.
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Question", style="cyan", width=25)
    table.add_column("Response", style="green", width=35)
    if prev_results:
        table.add_column("Previous", style="dim", width=10)

    for row, (asked, answered) in enumerate(results):
        cells = [asked, answered]
        if prev_results and row < len(prev_results):
            earlier = prev_results[row][1]
            cells.append("📈" if len(answered) > len(earlier) else "📉")
        table.add_row(*cells)

    console.print(table)
    console.print()
|
||||
|
||||
|
||||
def train_interactive(model, optimizer, loss_fn, train_data, test_questions,
|
||||
char_to_idx, idx_to_char, max_time_minutes=15,
|
||||
checkpoint_steps=1000, auto_continue_seconds=10):
|
||||
"""
|
||||
Train with interactive checkpoints.
|
||||
|
||||
Args:
|
||||
checkpoint_steps: Pause every N steps to show results
|
||||
auto_continue_seconds: Auto-continue after N seconds (0 = wait for ENTER)
|
||||
"""
|
||||
max_time_seconds = max_time_minutes * 60
|
||||
|
||||
print("=" * 70)
|
||||
print(f"INTERACTIVE TRAINING - {max_time_minutes} MINUTES")
|
||||
print("=" * 70)
|
||||
print(f"Dataset: {len(train_data)} conversations")
|
||||
print(f"Checkpoints: Every {checkpoint_steps} steps")
|
||||
print(f"Auto-continue: {auto_continue_seconds}s (or press ENTER)")
|
||||
print("=" * 70)
|
||||
print()
|
||||
print("Watch the model learn from gibberish to coherent responses!")
|
||||
print()
|
||||
|
||||
# Initial evaluation (before training)
|
||||
print("Evaluating initial model (untrained)...")
|
||||
initial_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
|
||||
show_checkpoint_panel(0, 0, 999.9, initial_results)
|
||||
|
||||
if auto_continue_seconds > 0:
|
||||
print(f"Starting training in {auto_continue_seconds} seconds (or press ENTER)...")
|
||||
time.sleep(auto_continue_seconds)
|
||||
elif auto_continue_seconds == 0:
|
||||
print("Starting training immediately...")
|
||||
time.sleep(0.5)
|
||||
else:
|
||||
input("Press ENTER to start training...")
|
||||
|
||||
print()
|
||||
print("Training started...")
|
||||
print()
|
||||
|
||||
start_time = time.time()
|
||||
losses = []
|
||||
step = 0
|
||||
checkpoint_num = 1
|
||||
prev_results = initial_results
|
||||
|
||||
next_checkpoint = checkpoint_steps
|
||||
|
||||
while True:
|
||||
elapsed = time.time() - start_time
|
||||
if elapsed >= max_time_seconds:
|
||||
break
|
||||
|
||||
# Training step
|
||||
tokens = train_data[np.random.randint(len(train_data))]
|
||||
input_seq = tokens[:-1]
|
||||
target_seq = tokens[1:]
|
||||
|
||||
x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
|
||||
y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)
|
||||
|
||||
logits = model.forward(x)
|
||||
|
||||
batch_size, seq_len, vocab_size = logits.shape
|
||||
logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
|
||||
targets_flat = y_true.reshape(batch_size * seq_len)
|
||||
loss = loss_fn.forward(logits_flat, targets_flat)
|
||||
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
|
||||
for param in model.parameters():
|
||||
if param.grad is not None:
|
||||
np.clip(param.grad, -1.0, 1.0, out=param.grad)
|
||||
|
||||
optimizer.step()
|
||||
|
||||
losses.append(loss.data.item())
|
||||
step += 1
|
||||
|
||||
# Show progress every 100 steps
|
||||
if step % 100 == 0:
|
||||
avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
|
||||
print(f"[{int(elapsed):4d}s] Step {step:5d} | Loss: {avg_loss:.4f}")
|
||||
|
||||
# Checkpoint evaluation
|
||||
if step >= next_checkpoint:
|
||||
avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
|
||||
|
||||
print()
|
||||
print(f"Evaluating at step {step}...")
|
||||
current_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
|
||||
|
||||
show_checkpoint_panel(checkpoint_num, step, avg_loss, current_results, prev_results)
|
||||
|
||||
prev_results = current_results
|
||||
checkpoint_num += 1
|
||||
next_checkpoint += checkpoint_steps
|
||||
|
||||
# Interactive pause
|
||||
if auto_continue_seconds > 0:
|
||||
print(f"Continuing in {auto_continue_seconds}s (or press ENTER)...")
|
||||
time.sleep(auto_continue_seconds)
|
||||
elif auto_continue_seconds == 0:
|
||||
print("Continuing immediately...")
|
||||
time.sleep(0.5)
|
||||
else:
|
||||
input("Press ENTER to continue training...")
|
||||
|
||||
print()
|
||||
print("Training resumed...")
|
||||
print()
|
||||
|
||||
# Final results
|
||||
final_elapsed = time.time() - start_time
|
||||
final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
|
||||
initial_loss = np.mean(losses[:10])
|
||||
improvement = (1 - final_loss / initial_loss) * 100
|
||||
|
||||
print()
|
||||
print("=" * 70)
|
||||
print("TRAINING COMPLETE!")
|
||||
print("=" * 70)
|
||||
print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.1f} minutes)")
|
||||
print(f"Total steps: {step:,}")
|
||||
print(f"Initial loss: {initial_loss:.4f}")
|
||||
print(f"Final loss: {final_loss:.4f}")
|
||||
print(f"Improvement: {improvement:.1f}%")
|
||||
print()
|
||||
|
||||
# Final evaluation
|
||||
print("Final evaluation...")
|
||||
final_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
|
||||
show_checkpoint_panel("FINAL", step, final_loss, final_results, prev_results)
|
||||
|
||||
return losses, step
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """Run the TinyTalks interactive learning dashboard demo.

    Builds the dataset and character tokenizer, encodes every conversation to
    80 tokens, creates a one-layer GPT with an Adam optimizer, trains it for
    five minutes with a checkpoint every 1000 steps, and prints a summary.
    """
    banner = "=" * 70

    for line in (
        "",
        banner,
        "TINYTALKS INTERACTIVE LEARNING DASHBOARD",
        banner,
        "",
        "Watch a transformer learn to chat in real-time!",
        "You'll see responses improve from gibberish to coherent answers.",
        "",
    ):
        print(line)

    # Dataset
    conversations = create_tinytalks_dataset()
    stats = get_dataset_stats()
    print(f"Dataset: {stats['total_examples']} examples ({stats['unique_examples']} unique)")
    print()

    # Tokenizer
    char_to_idx, idx_to_char = create_tokenizer(conversations)
    vocab_size = len(idx_to_char)

    # Encode every conversation to a fixed length.
    max_seq_len = 80
    train_data = [
        encode_conversation(question, answer, char_to_idx, max_seq_len)
        for question, answer in conversations
    ]

    # Test questions for checkpoints
    test_questions = [
        "Hi",
        "How are you",
        "What is your name",
        "What is the sky",
        "Is grass green",
    ]

    # Model: Ultra-tiny for speed
    model = GPT(
        vocab_size=vocab_size,
        embed_dim=16,
        num_layers=1,
        num_heads=2,
        max_seq_len=max_seq_len,
    )
    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f"Model: {num_params:,} parameters")
    print()

    # Optimizer
    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Settings
    train_time = 5            # minutes (shorter for demo)
    checkpoint_steps = 1000   # evaluate every 1000 steps
    auto_continue = 0         # 0 = continue without a timed wait

    print(f"Training for {train_time} minutes")
    print(f"Checkpoints every {checkpoint_steps} steps")
    print()

    # Train with interactive checkpoints
    losses, total_steps = train_interactive(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_data=train_data,
        test_questions=test_questions,
        char_to_idx=char_to_idx,
        idx_to_char=idx_to_char,
        max_time_minutes=train_time,
        checkpoint_steps=checkpoint_steps,
        auto_continue_seconds=auto_continue,
    )

    for line in ("", banner, "DEMO COMPLETE!", banner, ""):
        print(line)
    print("You just watched a transformer learn from scratch!")
    print(f"✓ {total_steps:,} training steps")
    print(f"✓ {len(losses)} loss values")
    print(f"✓ {(1 - np.mean(losses[-100:])/np.mean(losses[:10]))*100:.1f}% improvement")
    print()
    print("Key takeaway: Loss decrease = Better responses!")
    print()


if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -1,336 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Monitored Training Script for TinyTalks
|
||||
========================================
|
||||
|
||||
Features:
|
||||
- Early stopping if loss doesn't improve
|
||||
- Continuous progress monitoring
|
||||
- Automatic experiment termination for bad runs
|
||||
- Clear feedback on learning progress
|
||||
|
||||
Usage:
|
||||
python train_monitored.py --mode test # 10 epochs, quick validation
|
||||
python train_monitored.py --mode full # 30 epochs, full training
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import time
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to path
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
|
||||
from rich.console import Console
|
||||
from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn
|
||||
from rich.table import Table
|
||||
from rich import box
|
||||
|
||||
# Import TinyTorch components
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.text.tokenization import CharTokenizer
|
||||
|
||||
console = Console()
|
||||
|
||||
# Import TinyGPT and dataset classes.
# The script is executed in its own namespace so that (a) the file handle is
# closed deterministically, (b) tracebacks name the real source file, and
# (c) any `if __name__ == "__main__":` guard in that script does not fire when
# this file is itself run as __main__. Its public names are then copied into
# this module, which is what the previous bare exec() relied on.
_gpt_source = project_root / "milestones/05_2017_transformer/tinytalks_gpt.py"
_gpt_namespace = {"__name__": "tinytalks_gpt", "__file__": str(_gpt_source)}
exec(compile(_gpt_source.read_text(encoding="utf-8"), str(_gpt_source), "exec"), _gpt_namespace)
globals().update(
    {name: value for name, value in _gpt_namespace.items() if not name.startswith("__")}
)
|
||||
|
||||
|
||||
class TrainingMonitor:
    """Track reported losses and decide when training should stop early."""

    def __init__(self, patience=5, min_delta=0.01):
        """
        Args:
            patience: consecutive non-improving checks tolerated before stopping
            min_delta: a loss must beat the best so far by more than this to
                count as an improvement
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.checks_without_improvement = 0
        self.losses = []

    def check(self, current_loss):
        """
        Record ``current_loss`` and report whether training should go on.

        Returns:
            (should_continue, message)
        """
        self.losses.append(current_loss)
        gain = self.best_loss - current_loss

        if gain > self.min_delta:
            # New best: remember it and reset the patience counter.
            self.best_loss = current_loss
            self.checks_without_improvement = 0
            return True, f"✓ Loss improved by {gain:.4f}"

        self.checks_without_improvement += 1
        if self.checks_without_improvement < self.patience:
            return True, f"⚠ No improvement ({self.checks_without_improvement}/{self.patience})"
        return False, f"✗ No improvement for {self.patience} checks. Stopping."

    def summary(self):
        """Summarise the recorded losses.

        Returns the string ``"Not enough data"`` when fewer than two losses
        were recorded, otherwise a dict of summary statistics.
        """
        if len(self.losses) < 2:
            return "Not enough data"

        first, last = self.losses[0], self.losses[-1]
        drop = first - last
        if first > 0:
            drop_pct = (drop / first) * 100
        else:
            drop_pct = 0

        return {
            'initial_loss': first,
            'final_loss': last,
            'best_loss': min(self.losses),
            'total_decrease': drop,
            'decrease_percent': drop_pct,
            'num_checks': len(self.losses),
        }
|
||||
|
||||
|
||||
def train_with_monitoring(model, dataset, optimizer, criterion, config, monitor):
    """
    Run epoch-based training, consulting ``monitor`` for early stopping.

    Every ``config['check_interval']`` batches (default 50) the running
    average loss of the current epoch is passed to ``monitor.check``; when
    the monitor says stop, training ends immediately.

    Args:
        model: TinyGPT model
        dataset: TinyTalksDataset (indexable, yields (input, target) pairs)
        optimizer: Adam optimizer
        criterion: CrossEntropyLoss
        config: training configuration dict ('epochs', 'batch_size',
            optional 'check_interval')
        monitor: TrainingMonitor instance

    Returns:
        True if all epochs completed, False if early stopping triggered.
    """
    total_epochs = config['epochs']
    batch_size = config['batch_size']
    check_interval = config.get('check_interval', 50)  # batches between checks

    console.print("\n[bold cyan]Starting Training with Monitoring[/bold cyan]")
    console.print(f" Check interval: Every {check_interval} batches")
    console.print(f" Early stopping: {monitor.patience} checks without improvement\n")

    batches_seen = 0
    run_started = time.time()

    for epoch in range(total_epochs):
        epoch_started = time.time()
        running_loss = 0.0
        batches_this_epoch = 0

        console.print(f"[bold]Epoch {epoch+1}/{total_epochs}[/bold]")

        # Visit the sequences in a fresh random order each epoch.
        num_sequences = len(dataset)
        order = np.random.permutation(num_sequences)

        for lo in range(0, num_sequences, batch_size):
            hi = min(lo + batch_size, num_sequences)

            # Gather the (input, target) pairs for this batch.
            samples = [dataset[idx] for idx in order[lo:hi]]
            batch_input = Tensor(np.array([inp for inp, _ in samples]))
            batch_target = Tensor(np.array([tgt for _, tgt in samples]))

            # Forward pass
            logits = model.forward(batch_input)

            # Flatten to (tokens, vocab) / (tokens,) for the loss.
            rows, seq_length, vocab_size = logits.shape
            loss = criterion.forward(
                logits.reshape(rows * seq_length, vocab_size),
                batch_target.reshape(-1),
            )

            # Backward and optimize; gradients are cleared after the step.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Track loss
            running_loss += float(loss.data)
            batches_this_epoch += 1
            batches_seen += 1

            # Monitor progress at check intervals
            if batches_seen % check_interval != 0:
                continue

            avg_loss = running_loss / batches_this_epoch
            should_continue, message = monitor.check(avg_loss)

            elapsed = time.time() - run_started
            console.print(f" Batch {batches_seen} | Loss: {avg_loss:.4f} | {message} | Time: {elapsed:.1f}s")

            if not should_continue:
                console.print(f"\n[yellow]Early stopping triggered at epoch {epoch+1}, batch {batches_this_epoch}[/yellow]")
                return False

        # Epoch summary
        avg_epoch_loss = running_loss / batches_this_epoch
        epoch_time = time.time() - epoch_started
        console.print(f" → Epoch {epoch+1} complete: Avg Loss = {avg_epoch_loss:.4f} | Time: {epoch_time:.1f}s\n")

    console.print("[green]✓ Training completed successfully![/green]\n")
    return True
|
||||
|
||||
|
||||
def main():
    """Run a monitored TinyTalks training session from the command line.

    Parses the CLI options, enables autograd, builds the run configuration
    for the selected mode, loads the TinyTalks training split, constructs
    the model / optimizer / loss / monitor, runs ``train_with_monitoring``
    and finally prints a summary table plus a recommendation.

    Command-line options:
        --mode            'test' (10 epochs) or 'full' (30 epochs).
        --patience        Early stopping patience, in checks without
                          improvement.
        --min-delta       Minimum loss decrease that counts as improvement.
        --check-interval  Number of batches between progress checks.
    """
    parser = argparse.ArgumentParser(description='Monitored TinyTalks Training')
    parser.add_argument('--mode', choices=['test', 'full'], default='test',
                        help='Training mode: test (10 epochs) or full (30 epochs)')
    parser.add_argument('--patience', type=int, default=5,
                        help='Early stopping patience (checks without improvement)')
    parser.add_argument('--min-delta', type=float, default=0.01,
                        help='Minimum loss decrease to count as improvement')
    parser.add_argument('--check-interval', type=int, default=50,
                        help='Check progress every N batches')

    args = parser.parse_args()

    # Enable autograd
    enable_autograd()

    # The two modes differ only in epoch count and display label; every
    # other hyperparameter is shared, so the config is built once.
    if args.mode == 'test':
        epochs, mode_label = 10, 'TEST (Quick Validation)'
    else:  # full
        epochs, mode_label = 30, 'FULL (Complete Training)'

    config = {
        'epochs': epochs,
        'batch_size': 32,
        'lr': 0.001,
        'embed_dim': 128,
        'num_layers': 6,
        'num_heads': 8,
        'check_interval': args.check_interval,
        'mode': mode_label,
    }

    # The dataset's sequence length and the model's max_seq_len are passed
    # the same value, so it is defined in one place.
    seq_length = 64

    # Horizontal rule shared by both banners.
    rule = "[bold cyan]═══════════════════════════════════════════════════[/bold cyan]"

    # Display configuration
    console.print("\n" + rule)
    console.print("[bold cyan] Monitored TinyTalks Training - Option C [/bold cyan]")
    console.print(rule + "\n")

    table = Table(box=box.ROUNDED)
    table.add_column("Parameter", style="cyan")
    table.add_column("Value", style="yellow")

    table.add_row("Mode", config['mode'])
    table.add_row("Epochs", str(config['epochs']))
    table.add_row("Batch Size", str(config['batch_size']))
    table.add_row("Learning Rate", str(config['lr']))
    table.add_row("Model Size", f"{config['embed_dim']}d, {config['num_layers']}L, {config['num_heads']}H")
    table.add_row("Early Stopping Patience", str(args.patience))
    table.add_row("Min Delta", str(args.min_delta))
    table.add_row("Check Interval", f"Every {args.check_interval} batches")

    console.print(table)
    console.print()

    # Load dataset
    console.print("[bold]Loading TinyTalks dataset...[/bold]")
    dataset_path = project_root / "datasets/tinytalks/splits/train.txt"
    # Explicit encoding: without it the decoded text (and the vocabulary the
    # dataset builds from it) would depend on the platform's locale.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        text = f.read()

    dataset = TinyTalksDataset(text, seq_length=seq_length)
    console.print(f" ✓ Loaded: {len(text):,} chars, {dataset.tokenizer.vocab_size} vocab\n")

    # Initialize model
    console.print("[bold]Initializing model...[/bold]")
    model = TinyGPT(
        vocab_size=dataset.tokenizer.vocab_size,
        embed_dim=config['embed_dim'],
        num_layers=config['num_layers'],
        num_heads=config['num_heads'],
        max_seq_len=seq_length
    )

    params = model.parameters()
    param_count = sum(p.data.size for p in params)
    console.print(f" ✓ Model initialized: {param_count:,} parameters\n")

    # Initialize training components
    optimizer = Adam(params, lr=config['lr'])
    criterion = CrossEntropyLoss()
    monitor = TrainingMonitor(patience=args.patience, min_delta=args.min_delta)

    # Train
    console.print("[bold]Starting training...[/bold]\n")
    start_time = time.time()

    # train_with_monitoring returns True when all epochs complete and False
    # when early stopping is triggered.
    success = train_with_monitoring(model, dataset, optimizer, criterion, config, monitor)

    total_time = time.time() - start_time

    # Summary
    console.print("\n" + rule)
    console.print("[bold cyan] Training Summary [/bold cyan]")
    console.print(rule + "\n")

    summary = monitor.summary()

    result_table = Table(box=box.ROUNDED)
    result_table.add_column("Metric", style="cyan")
    result_table.add_column("Value", style="yellow")

    result_table.add_row("Status", "✓ SUCCESS" if success else "⚠ EARLY STOP")
    result_table.add_row("Total Time", f"{total_time/60:.1f} minutes")
    result_table.add_row("Initial Loss", f"{summary['initial_loss']:.4f}")
    result_table.add_row("Final Loss", f"{summary['final_loss']:.4f}")
    result_table.add_row("Best Loss", f"{summary['best_loss']:.4f}")
    result_table.add_row("Total Decrease", f"{summary['total_decrease']:.4f} ({summary['decrease_percent']:.1f}%)")
    result_table.add_row("Checks Performed", str(summary['num_checks']))

    console.print(result_table)
    console.print()

    # Recommendation: an early stop is always reported as a failure; a
    # completed run is graded by the percentage loss decrease.
    if not success:
        console.print("[bold red]✗ FAILED: Training stopped early. Try different hyperparameters.[/bold red]")
    elif summary['decrease_percent'] > 50:
        console.print("[bold green]✓ EXCELLENT: Model is learning well! Continue with full training.[/bold green]")
    elif summary['decrease_percent'] > 20:
        console.print("[bold yellow]⚠ MODERATE: Model is learning but slowly. Consider tuning hyperparameters.[/bold yellow]")
    else:
        console.print("[bold red]✗ POOR: Model not learning effectively. Needs hyperparameter adjustment.[/bold red]")
|
||||
|
||||
|
||||
# Run the training entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user