diff --git a/modules/source/12_attention/attention_dev.py b/modules/source/12_attention/attention_dev.py
index 7800c15d..79618f6c 100644
--- a/modules/source/12_attention/attention_dev.py
+++ b/modules/source/12_attention/attention_dev.py
@@ -313,6 +313,7 @@ Step-by-Step Attention Computation:
 """
 
 # %% nbgrader={"grade": false, "grade_id": "attention-function", "locked": false, "solution": true}
+#| export
 def scaled_dot_product_attention(Q: Tensor, K: Tensor, V: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
     """
     Compute scaled dot-product attention.
@@ -526,6 +527,7 @@ This parallelization allows the model to attend to different representation subs
 """
 
 # %% nbgrader={"grade": false, "grade_id": "multihead-attention", "locked": false, "solution": true}
+#| export
 class MultiHeadAttention:
     """
     Multi-head attention mechanism.
diff --git a/modules/source/13_transformers/transformers_dev.py b/modules/source/13_transformers/transformers_dev.py
index b068915a..ee34304f 100644
--- a/modules/source/13_transformers/transformers_dev.py
+++ b/modules/source/13_transformers/transformers_dev.py
@@ -853,6 +853,7 @@ Each layer adds information to this stream rather than replacing it, creating a
 """
 
 # %% nbgrader={"grade": false, "grade_id": "transformer-block", "solution": true}
+#| export
 class TransformerBlock:
     """
     Complete Transformer Block with self-attention, MLP, and residual connections.