Mirror of https://github.com/MLSysBook/TinyTorch.git, synced 2026-03-11 22:25:29 -05:00
Reset package and export modules 01-07 only (skip broken spatial module)
tinytorch/models/transformer.py (generated): 148 lines removed
@@ -1,148 +0,0 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.

# %% auto 0
__all__ = ['Tensor', 'Linear', 'MultiHeadAttention', 'Embedding', 'gelu']

# %% ../../modules/source/13_transformers/transformers_dev.ipynb 1
import numpy as np
import math
from typing import Optional, List

# Minimal implementations for development - in practice these import from previous modules
class Tensor:
    """Minimal Tensor class for transformer development - imports from Module 01 in practice."""
    def __init__(self, data, requires_grad=False):
        self.data = np.array(data)
        self.shape = self.data.shape
        self.size = self.data.size
        self.requires_grad = requires_grad
        self.grad = None

    def __add__(self, other):
        if isinstance(other, Tensor):
            return Tensor(self.data + other.data)
        return Tensor(self.data + other)

    def __mul__(self, other):
        if isinstance(other, Tensor):
            return Tensor(self.data * other.data)
        return Tensor(self.data * other)

    def matmul(self, other):
        return Tensor(np.dot(self.data, other.data))

    def sum(self, axis=None, keepdims=False):
        return Tensor(self.data.sum(axis=axis, keepdims=keepdims))

    def mean(self, axis=None, keepdims=False):
        return Tensor(self.data.mean(axis=axis, keepdims=keepdims))

    def reshape(self, *shape):
        return Tensor(self.data.reshape(shape))

    def __repr__(self):
        return f"Tensor(data={self.data}, shape={self.shape})"

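# --- Usage sketch (illustrative, not part of the original file) ---
# Exercises the minimal Tensor wrapper above; the values are arbitrary and
# chosen only to make the shapes easy to follow.
a = Tensor([[1.0, 2.0], [3.0, 4.0]])
b = Tensor([[5.0, 6.0], [7.0, 8.0]])
print(a + b)            # elementwise add
print(a * 2.0)          # scalar multiply
print(a.matmul(b))      # 2x2 matrix product via np.dot
print(a.mean(axis=0))   # column means, shape (2,)
print(a.reshape(4, 1))  # same data viewed as a column vector
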
class Linear:
    """Minimal Linear layer - imports from Module 03 in practice."""
    def __init__(self, in_features, out_features, bias=True):
        # Xavier/Glorot initialization
        std = math.sqrt(2.0 / (in_features + out_features))
        self.weight = Tensor(np.random.normal(0, std, (in_features, out_features)))
        self.bias = Tensor(np.zeros(out_features)) if bias else None

    def forward(self, x):
        output = x.matmul(self.weight)
        if self.bias is not None:
            output = output + self.bias
        return output

    def parameters(self):
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

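# --- Usage sketch (illustrative, not part of the original file) ---
# One Linear layer on a small batch; in_features/out_features are arbitrary.
np.random.seed(0)                  # reproducible weights for the sketch
layer = Linear(in_features=4, out_features=3)
x = Tensor(np.random.randn(2, 4))  # batch of 2 input vectors
y = layer.forward(x)               # (2, 4) @ (4, 3) + bias -> (2, 3)
print(y.shape)                     # (2, 3)
print(len(layer.parameters()))     # 2 (weight and bias)
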
class MultiHeadAttention:
    """Minimal MultiHeadAttention - imports from Module 12 in practice."""
    def __init__(self, embed_dim, num_heads):
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_proj = Linear(embed_dim, embed_dim)
        self.k_proj = Linear(embed_dim, embed_dim)
        self.v_proj = Linear(embed_dim, embed_dim)
        self.out_proj = Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # Linear projections
        Q = self.q_proj.forward(x)
        K = self.k_proj.forward(x)
        V = self.v_proj.forward(x)

        # Reshape for multi-head attention
        Q = Q.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        K = K.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        V = V.reshape(batch_size, seq_len, self.num_heads, self.head_dim)

        # Transpose to (batch_size, num_heads, seq_len, head_dim)
        Q = Tensor(np.transpose(Q.data, (0, 2, 1, 3)))
        K = Tensor(np.transpose(K.data, (0, 2, 1, 3)))
        V = Tensor(np.transpose(V.data, (0, 2, 1, 3)))

        # Scaled dot-product attention
        scores = Tensor(np.matmul(Q.data, np.transpose(K.data, (0, 1, 3, 2))))
        scores = scores * (1.0 / math.sqrt(self.head_dim))

        # Apply causal mask for autoregressive generation
        if mask is not None:
            scores = Tensor(scores.data + mask.data)

        # Softmax
        attention_weights = self._softmax(scores)

        # Apply attention to values
        out = Tensor(np.matmul(attention_weights.data, V.data))

        # Transpose back and reshape
        out = Tensor(np.transpose(out.data, (0, 2, 1, 3)))
        out = out.reshape(batch_size, seq_len, embed_dim)

        # Final linear projection
        return self.out_proj.forward(out)

    def _softmax(self, x):
        """Numerically stable softmax."""
        exp_x = Tensor(np.exp(x.data - np.max(x.data, axis=-1, keepdims=True)))
        return Tensor(exp_x.data / np.sum(exp_x.data, axis=-1, keepdims=True))

    def parameters(self):
        params = []
        params.extend(self.q_proj.parameters())
        params.extend(self.k_proj.parameters())
        params.extend(self.v_proj.parameters())
        params.extend(self.out_proj.parameters())
        return params

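# --- Usage sketch (illustrative, not part of the original file) ---
# One attention block over a toy sequence. forward() expects an additive mask:
# 0 where attention is allowed, a large negative number above the diagonal so
# softmax drives those weights to ~0. All dimensions here are arbitrary.
np.random.seed(0)
batch, seq, dim, heads = 2, 5, 8, 2
mha = MultiHeadAttention(embed_dim=dim, num_heads=heads)
x = Tensor(np.random.randn(batch, seq, dim))
causal_mask = Tensor(np.triu(np.full((seq, seq), -1e9), k=1))  # broadcasts over (batch, heads)
out = mha.forward(x, mask=causal_mask)
print(out.shape)  # (2, 5, 8) -- attention preserves the input shape
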
class Embedding:
    """Minimal Embedding layer - imports from Module 11 in practice."""
    def __init__(self, vocab_size, embed_dim):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Initialize with small random values
        self.weight = Tensor(np.random.normal(0, 0.02, (vocab_size, embed_dim)))

    def forward(self, indices):
        # Simple embedding lookup
        return Tensor(self.weight.data[indices.data])

    def parameters(self):
        return [self.weight]

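# --- Usage sketch (illustrative, not part of the original file) ---
# Embedding lookup for a small batch of integer token ids; sizes are arbitrary.
np.random.seed(0)
emb = Embedding(vocab_size=10, embed_dim=4)
ids = Tensor(np.array([[1, 2, 3], [4, 5, 6]]))  # (batch=2, seq=3) integer ids
vecs = emb.forward(ids)                          # row lookup -> (2, 3, 4)
print(vecs.shape)  # (2, 3, 4)
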
def gelu(x):
    """GELU activation function."""
    return Tensor(0.5 * x.data * (1 + np.tanh(np.sqrt(2 / np.pi) * (x.data + 0.044715 * x.data**3))))
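
# --- Usage sketch (illustrative, not part of the original file) ---
# gelu() above uses the tanh approximation:
# GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
z = Tensor(np.array([-2.0, 0.0, 2.0]))
print(gelu(z).data)  # roughly [-0.0454, 0.0, 1.9546]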