mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-02 18:36:30 -05:00
- Renamed all module.yaml files to [module_name].yml for consistency - Updated module configuration format and structure - Added new module configurations for all 20 modules - Removed obsolete benchmarking module (20_benchmarking) - Added new capstone module (20_capstone) - Enhanced autograd module with visual examples and improved implementation - Updated optimizers module with latest improvements - Standardized YAML structure across all modules
32 lines
1.0 KiB
YAML
32 lines
1.0 KiB
YAML
# Module configuration for TinyTorch module 11 (Tokenization).
name: "Tokenization"
number: 11
description: "Text processing systems that convert raw text into numerical sequences for language models"
learning_objectives:
  - "Implement character-level tokenization with special token handling"
  - "Build BPE (Byte Pair Encoding) tokenizer for subword units"
  - "Understand tokenization trade-offs: vocabulary size vs sequence length"
  - "Optimize tokenization performance for production systems"
  - "Analyze how tokenization affects model memory and training efficiency"

# Module references here and in next_modules use the "<number>_<name>" form.
prerequisites:
  - "02_tensor"

# NOTE(review): presumably the public class names this module provides;
# confirm against the module's implementation.
exports:
  - "CharTokenizer"
  - "BPETokenizer"
  - "TokenizationProfiler"
  - "OptimizedTokenizer"

systems_concepts:
  - "Memory efficiency of token representations"
  - "Vocabulary size vs model size tradeoffs"
  - "Tokenization throughput optimization"
  - "String processing performance"
  - "Cache-friendly text processing patterns"

ml_systems_focus: "Text processing pipelines, tokenization throughput, memory-efficient vocabulary management"

# Kept as a quoted string: it is a human-readable range, not a number.
estimated_time: "4-5 hours"

next_modules:
  - "12_embeddings"