Add sustainable AI and systems citations to future work section

Added citations for sustainable ML, energy-efficient computing, mixed precision training, and TinyML benchmarking to strengthen the future work discussion.

New citations:
- Strubell et al. (2019): Energy and Policy Considerations for Deep Learning in NLP - foundational work on ML carbon footprint
- Patterson et al. (2021): Carbon Emissions and Large Neural Network Training - comprehensive analysis of energy use in large models
- Micikevicius et al. (2018): Mixed Precision Training - ICLR paper on FP16/FP32 training techniques
- Banbury et al. (2021): Benchmarking TinyML Systems - TinyMLPerf benchmarking framework for edge AI

Citations integrated into:
- Roofline Models section (mixed precision advantages)
- Energy and Power Profiling section (sustainable ML and edge AI)

These citations ground the future work proposals in established research on green AI, energy-efficient ML, and edge deployment.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-11 22:33:36 -05:00 · 2025-11-18 17:31:21 -05:00
parent 6191a039f6
commit 9dfa8ae6ae
12 changed files with 339 additions and 305 deletions
--- a/modules/13_transformers/transformers_dev.py
+++ b/modules/13_transformers/transformers_dev.py
@@ -75,160 +75,98 @@ import numpy as np
 import math
 from typing import Optional, List

-# Import from previous modules - following proper dependency chain
-# Note: Actual imports happen in try/except blocks below with fallback implementations
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.layers import Linear
-# MultiHeadAttention import happens in try/except below
+"""
+## 🔗 Module Dependencies

-# For development, we'll use minimal implementations if imports fail
+This module REQUIRES completion of:
+- Module 01 (Tensor): Foundation data structure
+- Module 02 (Activations): GELU activation function
+- Module 03 (Layers): Linear layer for projections
+- Module 11 (Embeddings): Embedding and PositionalEncoding
+- Module 12 (Attention): MultiHeadAttention mechanism
+
+**Progressive Building**:
+```
+Module 01 (Tensor) ───────┐
+                          ├──> Module 03 (Layers) ──┐
+Module 02 (Activations) ──┘                         ├──> Module 12 (Attention) ──┐
+                                                    │                            ├──> Module 13 (Transformers)
+Module 11 (Embeddings) ─────────────────────────────┘                            │
+                                                                                 │
+Module 02 (GELU) ────────────────────────────────────────────────────────────────┘
+```
+
+**What You've Built**:
+- Module 01: Tensor (data structure)
+- Module 02: Activations including GELU
+- Module 03: Linear layers (building blocks)
+- Module 11: Embeddings (token and positional)
+- Module 12: MultiHeadAttention (core mechanism)
+
+**What This Module Adds**:
+- TransformerBlock (combines attention + MLP + normalization)
+- Complete GPT architecture
+- Autoregressive generation
+
+**To verify dependencies are met, run**:
+    python -c "from tinytorch.core.tensor import Tensor; print('✅ Module 01 ready')"
+    python -c "from tinytorch.core.activations import GELU; print('✅ Module 02 ready')"
+    python -c "from tinytorch.core.layers import Linear; print('✅ Module 03 ready')"
+    python -c "from tinytorch.text.embeddings import Embedding; print('✅ Module 11 ready')"
+    python -c "from tinytorch.core.attention import MultiHeadAttention; print('✅ Module 12 ready')"
+"""
+
+# Direct imports from previous modules - these MUST exist
+# If imports fail, students will get clear educational errors
 try:
-    from tinytorch.core.tensor import Tensor
-except ImportError:
-    print("Warning: Using minimal Tensor implementation for development")
-    class Tensor:
-        """Minimal Tensor class for transformer development."""
-        def __init__(self, data, requires_grad=False):
-            self.data = np.array(data)
-            self.shape = self.data.shape
-            self.size = self.data.size
-            self.requires_grad = requires_grad
-            self.grad = None
-
-        def __add__(self, other):
-            if isinstance(other, Tensor):
-                return Tensor(self.data + other.data)
-            return Tensor(self.data + other)
-
-        def __mul__(self, other):
-            if isinstance(other, Tensor):
-                return Tensor(self.data * other.data)
-            return Tensor(self.data * other)
-
-        def matmul(self, other):
-            return Tensor(np.dot(self.data, other.data))
-
-        def sum(self, axis=None, keepdims=False):
-            return Tensor(self.data.sum(axis=axis, keepdims=keepdims))
-
-        def mean(self, axis=None, keepdims=False):
-            return Tensor(self.data.mean(axis=axis, keepdims=keepdims))
-
-        def reshape(self, *shape):
-            return Tensor(self.data.reshape(shape))
-
-        def __repr__(self):
-            return f"Tensor(data={self.data}, shape={self.shape})"
+    from tinytorch.core.tensor import Tensor  # Module 01: Foundation
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 13 (Transformers) requires Module 01 (Tensor) to be completed first.\n"
+        "   This module builds on the Tensor class you created in Module 01.\n"
+        "   Please complete Module 01 first, then run 'tito module complete 01'.\n"
+        "   Original error: " + str(e)
+    ) from e

 try:
-    from tinytorch.core.layers import Linear
-except ImportError:
-    class Linear:
-        """Minimal Linear layer for development."""
-        def __init__(self, in_features, out_features, bias=True):
-            std = math.sqrt(2.0 / (in_features + out_features))
-            self.weight = Tensor(np.random.normal(0, std, (in_features, out_features)))
-            self.bias = Tensor(np.zeros(out_features)) if bias else None
-
-        def forward(self, x):
-            output = x.matmul(self.weight)
-            if self.bias is not None:
-                output = output + self.bias
-            return output
-
-        def parameters(self):
-            params = [self.weight]
-            if self.bias is not None:
-                params.append(self.bias)
-            return params
+    from tinytorch.core.layers import Linear  # Module 03: Building blocks
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 13 (Transformers) requires Module 03 (Layers) to be completed first.\n"
+        "   Transformers use Linear layers for projections.\n"
+        "   Please complete Module 03 first, then run 'tito module complete 03'.\n"
+        "   Original error: " + str(e)
+    ) from e

 try:
-    from tinytorch.core.attention import MultiHeadAttention
-except ImportError:
-    class MultiHeadAttention:
-        """Minimal MultiHeadAttention for development."""
-        def __init__(self, embed_dim, num_heads):
-            assert embed_dim % num_heads == 0
-            self.embed_dim = embed_dim
-            self.num_heads = num_heads
-            self.head_dim = embed_dim // num_heads
-
-            self.q_proj = Linear(embed_dim, embed_dim)
-            self.k_proj = Linear(embed_dim, embed_dim)
-            self.v_proj = Linear(embed_dim, embed_dim)
-            self.out_proj = Linear(embed_dim, embed_dim)
-
-        def forward(self, query, key, value, mask=None):
-            batch_size, seq_len, embed_dim = query.shape
-
-            # Linear projections
-            Q = self.q_proj.forward(query)
-            K = self.k_proj.forward(key)
-            V = self.v_proj.forward(value)
-
-            # Reshape for multi-head attention
-            Q = Q.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
-            K = K.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
-            V = V.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
-
-            # Transpose to (batch_size, num_heads, seq_len, head_dim)
-            Q = Tensor(np.transpose(Q.data, (0, 2, 1, 3)))
-            K = Tensor(np.transpose(K.data, (0, 2, 1, 3)))
-            V = Tensor(np.transpose(V.data, (0, 2, 1, 3)))
-
-            # Scaled dot-product attention
-            scores = Tensor(np.matmul(Q.data, np.transpose(K.data, (0, 1, 3, 2))))
-            scores = scores * (1.0 / math.sqrt(self.head_dim))
-
-            # Apply causal mask for autoregressive generation
-            if mask is not None:
-                scores = Tensor(scores.data + mask.data)
-
-            # Softmax
-            attention_weights = self._softmax(scores)
-
-            # Apply attention to values
-            out = Tensor(np.matmul(attention_weights.data, V.data))
-
-            # Transpose back and reshape
-            out = Tensor(np.transpose(out.data, (0, 2, 1, 3)))
-            out = out.reshape(batch_size, seq_len, embed_dim)
-
-            # Final linear projection
-            return self.out_proj.forward(out)
-
-        def _softmax(self, x):
-            """Numerically stable softmax."""
-            exp_x = Tensor(np.exp(x.data - np.max(x.data, axis=-1, keepdims=True)))
-            return Tensor(exp_x.data / np.sum(exp_x.data, axis=-1, keepdims=True))
-
-        def parameters(self):
-            params = []
-            params.extend(self.q_proj.parameters())
-            params.extend(self.k_proj.parameters())
-            params.extend(self.v_proj.parameters())
-            params.extend(self.out_proj.parameters())
-            return params
+    from tinytorch.core.attention import MultiHeadAttention  # Module 12: Core mechanism
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 13 (Transformers) requires Module 12 (Attention) to be completed first.\n"
+        "   Transformers are built around MultiHeadAttention.\n"
+        "   Please complete Module 12 first, then run 'tito module complete 12'.\n"
+        "   Original error: " + str(e)
+    ) from e

 try:
-    from tinytorch.core.embeddings import Embedding
-except ImportError:
-    class Embedding:
-        """Minimal Embedding layer for development."""
-        def __init__(self, vocab_size, embed_dim):
-            self.vocab_size = vocab_size
-            self.embed_dim = embed_dim
-            self.weight = Tensor(np.random.normal(0, 0.02, (vocab_size, embed_dim)))
+    from tinytorch.core.activations import GELU  # Module 02: Activation function
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 13 (Transformers) requires Module 02 (Activations) to be completed first.\n"
+        "   Transformers use GELU activation in MLP layers.\n"
+        "   Please complete Module 02 first, then run 'tito module complete 02'.\n"
+        "   Original error: " + str(e)
+    ) from e

-        def forward(self, indices):
-            return Tensor(self.weight.data[indices.data.astype(int)])
-
-        def parameters(self):
-            return [self.weight]
-
-def gelu(x):
-    """GELU activation function."""
-    return Tensor(0.5 * x.data * (1 + np.tanh(np.sqrt(2 / np.pi) * (x.data + 0.044715 * x.data**3))))
+try:
+    from tinytorch.text.embeddings import Embedding, PositionalEncoding  # Module 11: Embeddings
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 13 (Transformers) requires Module 11 (Embeddings) to be completed first.\n"
+        "   Transformers need token and positional embeddings.\n"
+        "   Please complete Module 11 first, then run 'tito module complete 11'.\n"
+        "   Original error: " + str(e)
+    ) from e

 # %% [markdown]
 """
@@ -757,6 +695,7 @@ class MLP:
        # Two-layer feed-forward network
        self.linear1 = Linear(embed_dim, hidden_dim)
        self.linear2 = Linear(hidden_dim, embed_dim)
+        self.gelu = GELU()  # GELU activation from Module 02
        ### END SOLUTION

    def forward(self, x):
@@ -773,14 +712,14 @@ class MLP:
        COMPUTATION FLOW:
        x -> Linear -> GELU -> Linear -> output

-        HINT: GELU activation is implemented above as a function
+        HINT: Use self.gelu.forward() to apply GELU activation
        """
        ### BEGIN SOLUTION
        # First linear layer with expansion
        hidden = self.linear1.forward(x)

-        # GELU activation
-        hidden = gelu(hidden)
+        # GELU activation (from Module 02)
+        hidden = self.gelu.forward(hidden)

        # Second linear layer back to original size
        output = self.linear2.forward(hidden)
--- a/modules/16_compression/compression_dev.py
+++ b/modules/16_compression/compression_dev.py
@@ -65,66 +65,70 @@ import copy
 from typing import List, Dict, Any, Tuple, Optional
 import time

-# Import from previous modules
-# Note: In the full package, these would be imports like:
-# from tinytorch.core.tensor import Tensor
-# from tinytorch.core.layers import Linear
-# For development, we'll create minimal implementations
+"""
+## 🔗 Module Dependencies

-class Tensor:
-    """Minimal Tensor class for compression development - imports from Module 01 in practice."""
-    def __init__(self, data, requires_grad=False):
-        self.data = np.array(data)
-        self.shape = self.data.shape
-        self.size = self.data.size
-        self.requires_grad = requires_grad
-        self.grad = None
+This module REQUIRES completion of:
+- Module 01 (Tensor): Foundation data structure for weight storage
+- Module 03 (Layers): Linear layer structure that we compress
+- Module 15 (Quantization): Related optimization technique

-    def __add__(self, other):
-        if isinstance(other, Tensor):
-            return Tensor(self.data + other.data)
-        return Tensor(self.data + other)
+**Progressive Building**:
+```
+Module 01 (Tensor) ───────┐
+                          ├──> Module 03 (Layers) ──┐
+Module 02 (Activations) ──┘                         ├──> Module 16 (Compression)
+                                                    │
+Module 15 (Quantization) ───────────────────────────┘
+```

-    def __mul__(self, other):
-        if isinstance(other, Tensor):
-            return Tensor(self.data * other.data)
-        return Tensor(self.data * other)
+**What You've Built**:
+- Module 01: Tensor (what we compress)
+- Module 03: Linear layers (what we prune)
+- Module 15: Quantization (complementary optimization)

-    def matmul(self, other):
-        return Tensor(np.dot(self.data, other.data))
+**What This Module Adds**:
+- Pruning techniques (remove weights)
+- Knowledge distillation (compress knowledge)
+- Low-rank approximation (compress matrices)
+- Sparsity measurement

-    def abs(self):
-        return Tensor(np.abs(self.data))
+**To verify dependencies are met, run**:
+    python -c "from tinytorch.core.tensor import Tensor; print('✅ Module 01 ready')"
+    python -c "from tinytorch.core.layers import Linear; print('✅ Module 03 ready')"
+    python -c "from tinytorch.optimization.quantization import quantize_model; print('✅ Module 15 ready')"
+"""

-    def sum(self, axis=None):
-        return Tensor(self.data.sum(axis=axis))
+# Direct imports from previous modules - these MUST exist
+# If imports fail, students will get clear educational errors
+try:
+    from tinytorch.core.tensor import Tensor  # Module 01: Foundation
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 16 (Compression) requires Module 01 (Tensor) to be completed first.\n"
+        "   This module compresses Tensor weights - you need Tensor to exist first!\n"
+        "   Please complete Module 01 first, then run 'tito module complete 01'.\n"
+        "   Original error: " + str(e)
+    ) from e

-    def __repr__(self):
-        return f"Tensor(shape={self.shape})"
-
-class Linear:
-    """Minimal Linear layer for compression development - imports from Module 03 in practice."""
-    def __init__(self, in_features, out_features, bias=True):
-        self.in_features = in_features
-        self.out_features = out_features
-        # Initialize with He initialization
-        self.weight = Tensor(np.random.randn(in_features, out_features) * np.sqrt(2.0 / in_features))
-        self.bias = Tensor(np.zeros(out_features)) if bias else None
-
-    def forward(self, x):
-        output = x.matmul(self.weight)
-        if self.bias is not None:
-            output = output + self.bias
-        return output
-
-    def parameters(self):
-        params = [self.weight]
-        if self.bias is not None:
-            params.append(self.bias)
-        return params
+try:
+    from tinytorch.core.layers import Linear  # Module 03: What we compress
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 16 (Compression) requires Module 03 (Layers) to be completed first.\n"
+        "   This module prunes Linear layer weights - you need Linear layers first!\n"
+        "   Please complete Module 03 first, then run 'tito module complete 03'.\n"
+        "   Original error: " + str(e)
+    ) from e

+# Sequential is a simple container - define it here since it's not exported from Module 03
 class Sequential:
-    """Minimal Sequential container for model compression."""
+    """
+    Sequential container for model compression.
+    
+    Simple container that chains layers together.
+    This is a utility class for testing compression techniques.
+    """
    def __init__(self, *layers):
        self.layers = list(layers)

--- a/modules/18_acceleration/acceleration_dev.py
+++ b/modules/18_acceleration/acceleration_dev.py
@@ -126,47 +126,58 @@ Real-world performance wins:
 """

 # %% nbgrader={"grade": false, "grade_id": "tensor-import", "solution": true}
-# Import required dependencies
-### BEGIN SOLUTION
-# Import tensor from our implementation
-import sys
-import os
-sys.path.append('/Users/VJ/GitHub/TinyTorch')
+"""
+## 🔗 Module Dependencies
+
+This module REQUIRES completion of:
+- Module 01 (Tensor): Foundation data structure we optimize
+- Module 03 (Layers): Linear layers for vectorization
+- Module 14 (Profiling): Profiler for measuring improvements
+
+**Progressive Building**:
+```
+Module 01 (Tensor) ──> [This Module: Optimize Tensor operations]
+Module 03 (Layers) ──> [This Module: Optimize Linear layers]
+Module 14 (Profiling) ──> [This Module: Measure improvements]
+```
+
+**What You've Built**:
+- Module 01: Tensor (what we optimize)
+- Module 03: Linear layers (uses optimized ops)
+- Module 14: Profiling (measure improvements)
+
+**What This Module Adds**:
+- Vectorized operations (SIMD optimization)
+- Kernel fusion (memory efficiency)
+- Mixed precision training (memory/speed)
+
+**To verify dependencies are met, run**:
+    python -c "from tinytorch.core.tensor import Tensor; print('✅ Module 01 ready')"
+    python -c "from tinytorch.core.layers import Linear; print('✅ Module 03 ready')"
+    python -c "from tinytorch.profiling.profiler import Profiler; print('✅ Module 14 ready')"
+"""
+
+# Direct imports from previous modules - these MUST exist
+# If imports fail, students will get clear educational errors
+try:
+    from tinytorch.core.tensor import Tensor  # Module 01: What we optimize
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 18 (Acceleration) requires Module 01 (Tensor) to be completed first.\n"
+        "   This module optimizes Tensor operations - you need Tensor to exist first!\n"
+        "   Please complete Module 01 first, then run 'tito module complete 01'.\n"
+        "   Original error: " + str(e)
+    ) from e

 try:
-    # Import from the modules directory structure
-    import importlib.util
-    spec = importlib.util.spec_from_file_location("tensor_dev", "/Users/VJ/GitHub/TinyTorch/modules/01_tensor/tensor_dev.py")
-    tensor_module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(tensor_module)
-    Tensor = tensor_module.Tensor
-except ImportError:
-    # Fallback for testing
-    class Tensor:
-        def __init__(self, data, requires_grad=False):
-            self.data = np.array(data, dtype=np.float32)
-            self.shape = self.data.shape
-            self.requires_grad = requires_grad
-            self.grad = None
-
-        def __add__(self, other):
-            return Tensor(self.data + other.data)
-
-        def __mul__(self, other):
-            return Tensor(self.data * other.data)
-
-        def matmul(self, other):
-            return Tensor(np.dot(self.data, other.data))
-
-        def reshape(self, *shape):
-            return Tensor(self.data.reshape(shape))
-
-        def sum(self, axis=None):
-            return Tensor(self.data.sum(axis=axis))
-
-        def backward(self):
-            pass
-### END SOLUTION
+    from tinytorch.core.layers import Linear  # Module 03: Uses optimized ops
+except ImportError as e:
+    raise ImportError(
+        "❌ Module 18 (Acceleration) requires Module 03 (Layers) to be completed first.\n"
+        "   This module optimizes Linear layer operations.\n"
+        "   Please complete Module 03 first, then run 'tito module complete 03'.\n"
+        "   Original error: " + str(e)
+    ) from e

 # %% [markdown]
 """
--- a/modules/20_competition_ARCHIVED/competition.py
+++ b/modules/20_competition_ARCHIVED/competition.py
--- a/modules/20_competition_ARCHIVED/competition_dev.py
+++ b/modules/20_competition_ARCHIVED/competition_dev.py
--- a/modules/20_competition_ARCHIVED/module.yaml
+++ b/modules/20_competition_ARCHIVED/module.yaml