From bcbcbf7f3dfd0177396d0ee30e35e02b33693511 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi
Date: Wed, 28 Jan 2026 18:36:16 -0500
Subject: [PATCH] fix(milestone): reset model before mixed task challenge

The transformer was being trained sequentially on reversal then copying,
which caused it to "forget" reversal before the mixed task. Now we
reinitialize the model before challenge 3 so it learns both tasks
together with proper prefix conditioning.
---
 .../05_2017_transformer/01_vaswani_attention.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tinytorch/milestones/05_2017_transformer/01_vaswani_attention.py b/tinytorch/milestones/05_2017_transformer/01_vaswani_attention.py
index cbe58637d..1041b69fc 100755
--- a/tinytorch/milestones/05_2017_transformer/01_vaswani_attention.py
+++ b/tinytorch/milestones/05_2017_transformer/01_vaswani_attention.py
@@ -544,6 +544,20 @@ def main():
     results['copying'] = (passed2, acc2)
 
     # Challenge 3: Mixed Tasks (the real test)
+    # Reset model and optimizer so it learns both tasks from scratch
+    # with prefix conditioning (otherwise it's stuck in "copy mode")
+    console.print("\n[dim]Reinitializing model for mixed task learning...[/dim]")
+    model = AttentionTransformer(
+        vocab_size=vocab_size,
+        embed_dim=embed_dim,
+        num_heads=num_heads,
+        seq_len=seq_len + 1,
+        num_layers=num_layers
+    )
+    for param in model.parameters():
+        param.requires_grad = True
+    optimizer = Adam(model.parameters(), lr=lr)
+
     train_mixed = generate_mixed_data(800, seq_len)
     test_mixed = generate_mixed_data(300, seq_len)
     passed3, acc3 = run_challenge(
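
For context, a minimal sketch of what prefix-conditioned mixed-task data
could look like. The actual generate_mixed_data in 01_vaswani_attention.py
may differ; the task-token IDs, vocab layout, and return format below are
assumptions, not the repo's implementation.

    import random

    REVERSE_TOKEN = 0   # assumed prefix token meaning "reverse the sequence"
    COPY_TOKEN = 1      # assumed prefix token meaning "copy the sequence"
    DATA_START = 2      # assumed first id of the data-token range

    def generate_mixed_data(num_samples, seq_len, vocab_size=16):
        """Return (input, target) pairs where input[0] is a task prefix.

        The prefix token tells a single model which task to perform, so it
        can learn reversal and copying together instead of sequentially.
        """
        data = []
        for _ in range(num_samples):
            seq = [random.randrange(DATA_START, vocab_size)
                   for _ in range(seq_len)]
            if random.random() < 0.5:
                # Reversal example: prefix token, then sequence; target reversed.
                data.append(([REVERSE_TOKEN] + seq, list(reversed(seq))))
            else:
                # Copying example: prefix token, then sequence; target unchanged.
                data.append(([COPY_TOKEN] + seq, list(seq)))
        return data

Under this layout, each input carries seq_len + 1 positions (prefix plus
data), which matches why the hunk above rebuilds AttentionTransformer with
seq_len=seq_len + 1.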