name: "Transformers"
|
|
number: 14
|
|
description: "Complete transformer architecture with LayerNorm, transformer blocks, and language model implementation"
|
|
learning_objectives:
|
|
- "Implement LayerNorm for stable deep network training"
|
|
- "Build position-wise feed-forward networks for transformer blocks"
|
|
- "Create complete transformer blocks with attention, normalization, and residual connections"
|
|
- "Develop full transformer models with embeddings, multiple layers, and generation capability"
|
|
- "Understand transformer scaling characteristics and production deployment considerations"

prerequisites:
  - "02_tensor"
  - "12_embeddings"
  - "13_attention"

exports:
  - "LayerNorm"
  - "PositionwiseFeedForward"
  - "TransformerBlock"
  - "Transformer"
  - "TransformerProfiler"

systems_concepts:
  - "Linear memory scaling with transformer depth"
  - "Layer normalization vs batch normalization trade-offs"
  - "Residual connection gradient flow optimization"
  - "Parameter allocation across depth, width, and attention heads"
  - "Training memory vs inference memory requirements"

ml_systems_focus: "Transformer architecture optimization, memory scaling with depth, production deployment strategies"

estimated_time: "6-7 hours"

next_modules:
  - "Advanced transformer architectures and optimization techniques"