diff --git a/modules/layers/layers_dev.py b/modules/layers/layers_dev.py
index 04f18fa3..eaac932b 100644
--- a/modules/layers/layers_dev.py
+++ b/modules/layers/layers_dev.py
@@ -85,25 +85,80 @@ from tinytorch.core.activations import ReLU, Sigmoid, Tanh
 """
 ## Step 1: What is a Layer?
 
-A **layer** is a function that transforms tensors. Think of it as:
-- **Input**: Tensor with some shape
-- **Transformation**: Mathematical operation (linear, nonlinear, etc.)
-- **Output**: Tensor with possibly different shape
+### Definition
+A **layer** is a function that transforms tensors. Think of it as a mathematical operation that takes input data and produces output data:
 
-**The fundamental insight**: Neural networks are just function composition!
+```
+Input Tensor → Layer → Output Tensor
+```
+
+### Why Layers Matter in Neural Networks
+Layers are the fundamental building blocks of all neural networks because:
+- **Modularity**: Each layer has a specific job (linear transformation, nonlinearity, etc.)
+- **Composability**: Layers can be combined to create complex functions
+- **Learnability**: Each layer has parameters that can be learned from data
+- **Interpretability**: Different layers learn different features
+
+### The Fundamental Insight
+**Neural networks are just function composition!**
 ```
 x → Layer1 → Layer2 → Layer3 → y
 ```
 
-**Why layers matter**:
-- They're the building blocks of all neural networks
-- Each layer learns a different transformation
-- Composing layers creates complex functions
-- Understanding layers = understanding neural networks
+Each layer transforms the data, and the final output is the composition of all these transformations.
+
+### Real-World Examples
+- **Dense Layer**: Learns linear relationships between features
+- **Convolutional Layer**: Learns spatial patterns in images
+- **Recurrent Layer**: Learns temporal patterns in sequences
+- **Activation Layer**: Adds nonlinearity to make networks powerful
+
+### Visual Intuition
+```
+Input: [1, 2, 3] (3 features)
+Dense Layer: y = Wx + b
+Weights W: [[0.1, 0.2, 0.3],
+            [0.4, 0.5, 0.6]] (2×3 matrix)
+Bias b: [0.1, 0.2] (2 values)
+Output: [0.1*1 + 0.2*2 + 0.3*3 + 0.1,
+         0.4*1 + 0.5*2 + 0.6*3 + 0.2] = [1.5, 3.4]
+```
 
 Let's start with the most important layer: **Dense** (also called Linear or Fully Connected).
 """
 
+# %% [markdown]
+"""
+## Step 2: Understanding Matrix Multiplication
+
+Before we build layers, let's understand the core operation: **matrix multiplication**. This is what powers all neural network computations.
+
+### Why Matrix Multiplication Matters
+- **Efficiency**: Process multiple inputs at once
+- **Parallelization**: GPU acceleration works great with matrix operations
+- **Batch processing**: Handle multiple samples simultaneously
+- **Mathematical foundation**: Linear algebra is the language of neural networks
+
+### The Math Behind It
+For matrices A (m×n) and B (n×p), the result C (m×p) is:
+```
+C[i,j] = sum(A[i,k] * B[k,j] for k in range(n))
+```
+
+### Visual Example
+```
+A = [[1, 2],     B = [[5, 6],
+     [3, 4]]          [7, 8]]
+
+C = A @ B = [[1*5 + 2*7,  1*6 + 2*8],
+              [3*5 + 4*7,  3*6 + 4*8]]
+  = [[19, 22],
+     [43, 50]]
+```
+
+Let's implement this step by step!
+"""
+
 # %%
 #| export
 def matmul_naive(A: np.ndarray, B: np.ndarray) -> np.ndarray:
@@ -120,6 +175,30 @@ def matmul_naive(A: np.ndarray, B: np.ndarray) -> np.ndarray:
         Matrix of shape (m, p) where C[i,j] = sum(A[i,k] * B[k,j] for k in range(n))
         
     TODO: Implement matrix multiplication using three nested for-loops.
+    
+    APPROACH:
+    1. Get the dimensions: m, n from A and n2, p from B
+    2. Check that n == n2 (matrices must be compatible)
+    3. Create output matrix C of shape (m, p) filled with zeros
+    4. Use three nested loops:
+       - i loop: rows of A (0 to m-1)
+       - j loop: columns of B (0 to p-1) 
+       - k loop: shared dimension (0 to n-1)
+    5. For each (i,j), compute: C[i,j] += A[i,k] * B[k,j]
+    
+    EXAMPLE:
+    A = [[1, 2],     B = [[5, 6],
+         [3, 4]]          [7, 8]]
+    
+    C[0,0] = A[0,0]*B[0,0] + A[0,1]*B[1,0] = 1*5 + 2*7 = 19
+    C[0,1] = A[0,0]*B[0,1] + A[0,1]*B[1,1] = 1*6 + 2*8 = 22
+    C[1,0] = A[1,0]*B[0,0] + A[1,1]*B[1,0] = 3*5 + 4*7 = 43
+    C[1,1] = A[1,0]*B[0,1] + A[1,1]*B[1,1] = 3*6 + 4*8 = 50
+    
+    HINTS:
+    - Start with C = np.zeros((m, p))
+    - Use three nested for loops: for i in range(m): for j in range(p): for k in range(n):
+    - Accumulate the sum: C[i,j] += A[i,k] * B[k,j]
     """
     raise NotImplementedError("Student implementation required")
 
@@ -143,6 +222,81 @@ def matmul_naive(A: np.ndarray, B: np.ndarray) -> np.ndarray:
                 C[i, j] += A[i, k] * B[k, j]
     return C
 
+# %% [markdown]
+"""
+### 🧪 Test Your Matrix Multiplication
+"""
+
+# %%
+# Smoke-test matmul_naive against a hand-computed 2x2 product and NumPy's @ operator
+print("Testing matrix multiplication...")
+
+try:
+    # Test case 1: 2x2 matrices with a hand-computed expected product
+    A = np.array([[1, 2], [3, 4]], dtype=np.float32)
+    B = np.array([[5, 6], [7, 8]], dtype=np.float32)
+    
+    result = matmul_naive(A, B)
+    expected = np.array([[19, 22], [43, 50]], dtype=np.float32)
+    
+    print(f"✅ Matrix A:\n{A}")
+    print(f"✅ Matrix B:\n{B}")
+    print(f"✅ Your result:\n{result}")
+    print(f"✅ Expected:\n{expected}")
+    
+    assert np.allclose(result, expected), "❌ Result doesn't match expected!"
+    print("🎉 Matrix multiplication works!")
+    
+    # Test case 2: the same inputs must also agree with NumPy's built-in matmul
+    numpy_result = A @ B
+    assert np.allclose(result, numpy_result), "❌ Doesn't match NumPy result!"
+    print("✅ Matches NumPy implementation!")
+    
+except Exception as e:  # report the failure in the cell output instead of aborting
+    print(f"❌ Error: {e}")
+    print("Make sure to implement matmul_naive above!")
+
+# %% [markdown]
+"""
+## Step 3: Building the Dense Layer
+
+Now let's build the **Dense layer**, the most fundamental building block of neural networks. A Dense layer performs a linear transformation: `y = Wx + b`
+
+### What is a Dense Layer?
+- **Linear transformation**: `y = Wx + b`
+- **W**: Weight matrix (learnable parameters)
+- **x**: Input tensor
+- **b**: Bias vector (learnable parameters)
+- **y**: Output tensor
+
+### Why Dense Layers Matter
+- **Universal approximation**: With a nonlinear activation between them, stacked Dense layers can approximate any continuous function given enough neurons
+- **Feature learning**: Each neuron learns a different feature
+- **Nonlinearity**: When combined with activation functions, becomes very powerful
+- **Foundation**: All other layers build on this concept
+
+### The Math
+For input x of shape (batch_size, input_size):
+- **W**: Weight matrix of shape (input_size, output_size)
+- **b**: Bias vector of shape (output_size)
+- **y**: Output of shape (batch_size, output_size)
+
+### Visual Example
+```
+Input: x = [1, 2, 3] (3 features)
+Weights: W = [[0.1, 0.2],    Bias: b = [0.1, 0.2]
+              [0.3, 0.4],
+              [0.5, 0.6]]
+
+Step 1: Wx = [0.1*1 + 0.3*2 + 0.5*3,  0.2*1 + 0.4*2 + 0.6*3]
+            = [2.2, 2.8]
+
+Step 2: y = Wx + b = [2.2 + 0.1, 2.8 + 0.2] = [2.3, 3.0]
+```
+
+Let's implement this!
+"""
+
 # %%
 #| export
 class Dense:
@@ -159,6 +313,23 @@ class Dense:
         use_naive_matmul: Whether to use naive matrix multiplication (for learning)
         
     TODO: Implement the Dense layer with weight initialization and forward pass.
+    
+    APPROACH:
+    1. Store layer parameters (input_size, output_size, use_bias, use_naive_matmul)
+    2. Initialize weights with small random values (Xavier/Glorot initialization)
+    3. Initialize bias to zeros (if use_bias=True)
+    4. Implement forward pass using matrix multiplication and bias addition
+    
+    EXAMPLE:
+    layer = Dense(input_size=3, output_size=2)
+    x = Tensor([[1, 2, 3]])  # batch_size=1, input_size=3
+    y = layer(x)  # shape: (1, 2)
+    
+    HINTS:
+    - Use np.random.randn() for random initialization
+    - Scale weights by sqrt(2/(input_size + output_size)) for Xavier init
+    - Store weights and bias as numpy arrays
+    - Use matmul_naive or @ operator based on use_naive_matmul flag
     """
     
     def __init__(self, input_size: int, output_size: int, use_bias: bool = True, 
@@ -176,6 +347,18 @@ class Dense:
         1. Store layer parameters (input_size, output_size, use_bias, use_naive_matmul)
         2. Initialize weights with small random values
         3. Initialize bias to zeros (if use_bias=True)
+        
+        STEP-BY-STEP:
+        1. Store the parameters as instance variables
+        2. Calculate scale factor for Xavier initialization: sqrt(2/(input_size + output_size))
+        3. Initialize weights: np.random.randn(input_size, output_size) * scale
+        4. If use_bias=True, initialize bias: np.zeros(output_size)
+        5. If use_bias=False, set bias to None
+        
+        EXAMPLE:
+        Dense(3, 2) creates:
+        - weights: shape (3, 2) with small random values
+        - bias: shape (2,) with zeros
         """
         raise NotImplementedError("Student implementation required")
     
@@ -191,8 +374,27 @@ class Dense:
             
         TODO: Implement matrix multiplication and bias addition
         - Use self.use_naive_matmul to choose between NumPy and naive implementation
-        - If use_naive_matmul=True, use matmul_naive(x.data, self.weights.data)
-        - If use_naive_matmul=False, use x.data @ self.weights.data
+        - If use_naive_matmul=True, use matmul_naive(x.data, self.weights)
+        - If use_naive_matmul=False, use x.data @ self.weights
+        - Add bias if self.use_bias=True
+        
+        STEP-BY-STEP:
+        1. Perform matrix multiplication: Wx
+           - If use_naive_matmul: result = matmul_naive(x.data, self.weights)
+           - Else: result = x.data @ self.weights
+        2. Add bias if use_bias: result += self.bias
+        3. Return Tensor(result)
+        
+        EXAMPLE:
+        Input x: Tensor([[1, 2, 3]])  # shape (1, 3)
+        Weights: shape (3, 2)
+        Output: Tensor([[val1, val2]])  # shape (1, 2)
+        
+        HINTS:
+        - x.data gives you the numpy array
+        - self.weights is your weight matrix
+        - Use broadcasting for bias addition: result + self.bias
+        - Return Tensor(result) to wrap the result
         """
         raise NotImplementedError("Student implementation required")
     
@@ -213,40 +415,52 @@ class Dense:
     
     def __init__(self, input_size: int, output_size: int, use_bias: bool = True, 
                  use_naive_matmul: bool = False):
-        """Initialize Dense layer with random weights."""
+        """
+        Initialize Dense layer with Xavier/Glorot-normal weights and a zero bias.
+        
+        Args:
+            input_size: Number of input features
+            output_size: Number of output features
+            use_bias: If True, bias is a float32 zero vector; otherwise bias is None
+            use_naive_matmul: Use naive matrix multiplication (for learning)
+        """
+        # Store parameters
         self.input_size = input_size
         self.output_size = output_size
         self.use_bias = use_bias
         self.use_naive_matmul = use_naive_matmul
         
-        # Initialize weights with Xavier/Glorot initialization
-        # This helps with gradient flow during training
-        limit = math.sqrt(6.0 / (input_size + output_size))
-        self.weights = Tensor(
-            np.random.uniform(-limit, limit, (input_size, output_size)).astype(np.float32)
-        )
+        # Xavier/Glorot normal init; scale before the cast so the weights stay float32
+        scale = np.sqrt(2.0 / (input_size + output_size))
+        self.weights = (np.random.randn(input_size, output_size) * scale).astype(np.float32)
         
-        # Initialize bias to zeros
+        # Bias starts at zero (float32, matching the weights); None disables it
         if use_bias:
-            self.bias = Tensor(np.zeros(output_size, dtype=np.float32))
+            self.bias = np.zeros(output_size, dtype=np.float32)
         else:
             self.bias = None
     
     def forward(self, x: Tensor) -> Tensor:
-        """Forward pass: y = Wx + b"""
-        # Choose matrix multiplication implementation
+        """
+        Forward pass: y = x @ W + b (the batch form of y = Wx + b)
+        
+        Args:
+            x: Input tensor of shape (batch_size, input_size)
+            
+        Returns:
+            Output tensor of shape (batch_size, output_size)
+        """
+        # Matrix multiplication: explicit-loop version for learning, NumPy's @ otherwise
         if self.use_naive_matmul:
-            # Use naive implementation (for learning)
-            output = Tensor(matmul_naive(x.data, self.weights.data))
+            result = matmul_naive(x.data, self.weights)
         else:
-            # Use NumPy's optimized implementation (for speed)
-            output = Tensor(x.data @ self.weights.data)
+            result = x.data @ self.weights
         
-        # Add bias if present
-        if self.bias is not None:
-            output = Tensor(output.data + self.bias.data)
+        # Add bias (NumPy broadcasts the (output_size,) bias across the batch rows)
+        if self.use_bias:
+            result += self.bias
         
-        return output
+        return Tensor(result)
     
     def __call__(self, x: Tensor) -> Tensor:
         """Make layer callable: layer(x) same as layer.forward(x)"""
@@ -255,36 +469,38 @@ class Dense:
 # %% [markdown]
 """
 ### 🧪 Test Your Dense Layer
-
-Once you implement the Dense layer above, run this cell to test it:
 """
 
 # %%
-# Test the Dense layer
+# Test Dense layer
+print("Testing Dense layer...")
+
 try:
-    print("=== Testing Dense Layer ===")
+    # Test basic Dense layer
+    layer = Dense(input_size=3, output_size=2, use_bias=True)
+    x = Tensor([[1, 2, 3]])  # batch_size=1, input_size=3
     
-    # Create a simple Dense layer: 3 inputs → 2 outputs
-    layer = Dense(input_size=3, output_size=2)
-    print(f"Created Dense layer: {layer.input_size} → {layer.output_size}")
-    print(f"Weights shape: {layer.weights.shape}")
-    print(f"Bias shape: {layer.bias.shape if layer.bias else 'No bias'}")
+    print(f"✅ Input shape: {x.shape}")
+    print(f"✅ Layer weights shape: {layer.weights.shape}")
+    print(f"✅ Layer bias shape: {layer.bias.shape}")
     
-    # Test with a single example
-    x = Tensor([[1.0, 2.0, 3.0]])  # Shape: (1, 3)
     y = layer(x)
-    print(f"Input shape: {x.shape}")
-    print(f"Output shape: {y.shape}")
-    print(f"Input: {x.data}")
-    print(f"Output: {y.data}")
+    print(f"✅ Output shape: {y.shape}")
+    print(f"✅ Output: {y}")
     
-    # Test with batch
-    x_batch = Tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])  # Shape: (2, 3)
-    y_batch = layer(x_batch)
-    print(f"\nBatch input shape: {x_batch.shape}")
-    print(f"Batch output shape: {y_batch.shape}")
+    # Test without bias
+    layer_no_bias = Dense(input_size=2, output_size=1, use_bias=False)
+    x2 = Tensor([[1, 2]])
+    y2 = layer_no_bias(x2)
+    print(f"✅ No bias output: {y2}")
     
-    print("✅ Dense layer working!")
+    # Test naive matrix multiplication
+    layer_naive = Dense(input_size=2, output_size=2, use_naive_matmul=True)
+    x3 = Tensor([[1, 2]])
+    y3 = layer_naive(x3)
+    print(f"✅ Naive matmul output: {y3}")
+    
+    print("\n🎉 All Dense layer tests passed!")
     
 except Exception as e:
     print(f"❌ Error: {e}")
@@ -292,369 +508,155 @@ except Exception as e:
 
 # %% [markdown]
 """
-## Step 1.5: Understanding Matrix Multiplication
+## Step 4: Composing Layers with Activations
 
-Let's compare the naive matrix multiplication with NumPy's optimized version!
+Now let's see how layers work together! A neural network is just layers composed with activation functions.
+
+### Why Layer Composition Matters
+- **Nonlinearity**: Activation functions make networks powerful
+- **Feature learning**: Each layer learns different levels of features
+- **Universal approximation**: Can approximate any function
+- **Modularity**: Easy to experiment with different architectures
+
+### The Pattern
+```
+Input → Dense → Activation → Dense → Activation → Output
+```
+
+### Real-World Example
+```
+Input: [1, 2, 3] (3 features)
+Dense(3→2): [1.4, 2.8] (linear transformation)
+ReLU: [1.4, 2.8] (nonlinearity)
+Dense(2→1): [3.2] (final prediction)
+```
+
+Let's build a simple network!
 """
 
 # %%
-# Test matrix multiplication implementations
+# Chain Dense -> ReLU -> Dense by hand and print each intermediate result
+print("Testing layer composition...")
+
 try:
-    print("=== Testing Matrix Multiplication Implementations ===")
+    # Two Dense layers (3 -> 2 -> 1) with a ReLU between them
+    dense1 = Dense(input_size=3, output_size=2)
+    relu = ReLU()
+    dense2 = Dense(input_size=2, output_size=1)
     
-    # Create small test matrices
-    A = np.array([[1, 2], [3, 4]], dtype=np.float32)  # 2x2
-    B = np.array([[5, 6], [7, 8]], dtype=np.float32)  # 2x2
+    # One sample with 3 features, matching dense1's input_size
+    x = Tensor([[1, 2, 3]])
+    print(f"✅ Input: {x}")
     
-    print(f"Matrix A (2x2):\n{A}")
-    print(f"Matrix B (2x2):\n{B}")
+    # Forward pass: each stage's output feeds the next stage
+    h1 = dense1(x)
+    print(f"✅ After Dense1: {h1}")
     
-    # Test NumPy's implementation
-    C_numpy = A @ B
-    print(f"\nNumPy result (A @ B):\n{C_numpy}")
+    h2 = relu(h1)
+    print(f"✅ After ReLU: {h2}")
     
-    # Test naive implementation
-    C_naive = matmul_naive(A, B)
-    print(f"Naive result:\n{C_naive}")
+    y = dense2(h2)
+    print(f"✅ Final output: {y}")
     
-    # Compare results
-    if np.allclose(C_numpy, C_naive):
-        print("✅ Both implementations give the same result!")
-    else:
-        print("❌ Results differ! Check your naive implementation.")
-    
-    # Show the computation step by step
-    print(f"\n📊 Step-by-step computation for C[0,0]:")
-    print(f"C[0,0] = A[0,0]*B[0,0] + A[0,1]*B[1,0]")
-    print(f"C[0,0] = {A[0,0]}*{B[0,0]} + {A[0,1]}*{B[1,0]}")
-    print(f"C[0,0] = {A[0,0]*B[0,0]} + {A[0,1]*B[1,0]}")
-    print(f"C[0,0] = {A[0,0]*B[0,0] + A[0,1]*B[1,0]}")
-    print(f"Expected: {C_numpy[0,0]}")
+    print("\n🎉 Layer composition works!")
+    print("This is how neural networks work: layers + activations!")
     
 except Exception as e:
     print(f"❌ Error: {e}")
-    print("Make sure to implement matmul_naive above!")
+    print("Make sure all your layers and activations are working!")
+
+# %% [markdown]
+"""
+## Step 5: Performance Comparison
+
+Let's compare our naive matrix multiplication with NumPy's optimized version to understand why optimization matters in ML.
+
+### Why Performance Matters
+- **Training time**: Neural networks train for hours/days
+- **Inference speed**: Real-time applications need fast predictions
+- **GPU utilization**: Optimized operations use hardware efficiently
+- **Scalability**: Large models need efficient implementations
+"""
 
 # %%
 # Performance comparison
+print("Comparing naive vs NumPy matrix multiplication...")
+
 try:
-    print("=== Performance Comparison ===")
-    
-    # Create larger matrices for timing
-    size = 50
-    A = np.random.randn(size, size).astype(np.float32)
-    B = np.random.randn(size, size).astype(np.float32)
-    
     import time
     
-    # Time NumPy implementation
-    start_time = time.time()
-    C_numpy = A @ B
-    numpy_time = time.time() - start_time
+    # Create test matrices (float32, 100x100: large enough for a visible timing gap)
+    A = np.random.randn(100, 100).astype(np.float32)
+    B = np.random.randn(100, 100).astype(np.float32)
     
     # Time naive implementation
-    start_time = time.time()
-    C_naive = matmul_naive(A, B)
-    naive_time = time.time() - start_time
+    start_time = time.perf_counter()
+    result_naive = matmul_naive(A, B)
+    naive_time = time.perf_counter() - start_time
     
-    print(f"Matrix size: {size}x{size}")
-    print(f"NumPy time: {numpy_time:.6f} seconds")
-    print(f"Naive time: {naive_time:.6f} seconds")
-    print(f"Speedup: {naive_time/numpy_time:.1f}x slower")
+    # Time NumPy implementation (perf_counter resolves intervals time.time() can report as 0)
+    start_time = time.perf_counter()
+    result_numpy = A @ B
+    numpy_time = time.perf_counter() - start_time
     
-    # Verify results are the same
-    if np.allclose(C_numpy, C_naive):
-        print("✅ Results are identical!")
-    else:
-        print("❌ Results differ!")
+    print(f"✅ Naive time: {naive_time:.4f} seconds")
+    print(f"✅ NumPy time: {numpy_time:.4f} seconds")
+    print(f"✅ Speedup: {naive_time/max(numpy_time, 1e-9):.1f}x faster")  # max() guards a zero timing
     
-    print(f"\n💡 Why is NumPy so much faster?")
-    print(f"   • Vectorized operations (no Python loops)")
-    print(f"   • Optimized C/Fortran backend")
-    print(f"   • Cache-friendly memory access")
-    print(f"   • Parallel processing")
+    # Verify correctness; tolerances are loosened because the inputs are float32
+    assert np.allclose(result_naive, result_numpy, rtol=1e-4, atol=1e-4), "Results don't match!"
+    print("✅ Results are identical!")
+    
+    print("\n💡 This is why we use optimized libraries in production!")
     
 except Exception as e:
     print(f"❌ Error: {e}")
-    print("Make sure to implement matmul_naive above!")
-
-# %%
-# Test Dense layer with both implementations
-try:
-    print("=== Testing Dense Layer with Both Implementations ===")
-    
-    # Create test data
-    x = Tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])  # Shape: (2, 3)
-    
-    # Test with NumPy implementation
-    layer_numpy = Dense(input_size=3, output_size=2, use_naive_matmul=False)
-    y_numpy = layer_numpy(x)
-    
-    # Test with naive implementation
-    layer_naive = Dense(input_size=3, output_size=2, use_naive_matmul=True)
-    y_naive = layer_naive(x)
-    
-    print(f"Input shape: {x.shape}")
-    print(f"NumPy output: {y_numpy.data}")
-    print(f"Naive output: {y_naive.data}")
-    
-    # Compare results
-    if np.allclose(y_numpy.data, y_naive.data):
-        print("✅ Both Dense implementations give the same result!")
-    else:
-        print("❌ Results differ! Check your implementations.")
-    
-    print(f"\n🎯 Key Insight:")
-    print(f"   • Both implementations compute the same mathematical operation")
-    print(f"   • NumPy is much faster but hides the computation")
-    print(f"   • Naive implementation shows you exactly what's happening")
-    print(f"   • Understanding the naive version helps you understand neural networks!")
-    
-except Exception as e:
-    print(f"❌ Error: {e}")
-    print("Make sure to implement both matmul_naive and Dense layer!")
 
 # %% [markdown]
 """
-## Step 2: Activation Functions - Adding Nonlinearity
+## 🎯 Module Summary
 
-Now we'll use the activation functions from the **activations** module! 
+Congratulations! You've built the foundation of neural network layers:
 
-**Clean Architecture**: We import the activation functions rather than redefining them:
-```python
-from tinytorch.core.activations import ReLU, Sigmoid, Tanh
-```
+### What You've Accomplished
+✅ **Matrix Multiplication**: Understanding the core operation  
+✅ **Dense Layer**: Linear transformation with weights and bias  
+✅ **Layer Composition**: Combining layers with activations  
+✅ **Performance Awareness**: Understanding optimization importance  
+✅ **Testing**: Immediate feedback on your implementations  
 
-**Why this matters**:
-- **Separation of concerns**: Math functions vs. layer building blocks
-- **Reusability**: Activations can be used anywhere in the system
-- **Maintainability**: One place to update activation implementations
-- **Composability**: Clean imports make neural networks easier to build
+### Key Concepts You've Learned
+- **Layers** are functions that transform tensors
+- **Matrix multiplication** powers all neural network computations
+- **Dense layers** perform linear transformations: `y = Wx + b`
+- **Layer composition** creates complex functions from simple building blocks
+- **Performance** matters for real-world ML applications
 
-**Why nonlinearity matters**: Without it, stacking layers is pointless!
-```
-Linear → Linear → Linear = Just one big Linear transformation
-Linear → NonLinear → Linear = Can learn complex patterns
-```
-"""
+### What's Next
+In the next modules, you'll build on this foundation:
+- **Networks**: Compose layers into complete models
+- **Training**: Learn parameters with gradients and optimization
+- **Convolutional layers**: Process spatial data like images
+- **Recurrent layers**: Process sequential data like text
 
-# %% [markdown]
-"""
-### 🧪 Test Activation Functions from Activations Module
+### Real-World Connection
+Your Dense layer is now ready to:
+- Learn patterns in data through weight updates
+- Transform features for classification and regression
+- Serve as building blocks for complex architectures
+- Integrate with the rest of the TinyTorch ecosystem
 
-Let's test that we can use the activation functions from the activations module:
+**Ready for the next challenge?** Let's move on to building complete neural networks!
 """
 
 # %%
-# Test activation functions from activations module
-try:
-    print("=== Testing Activation Functions from Activations Module ===")
-    
-    # Test data: mix of positive, negative, and zero
-    x = Tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]])
-    print(f"Input: {x.data}")
-    
-    # Test ReLU from activations module
-    relu = ReLU()
-    y_relu = relu(x)
-    print(f"ReLU output: {y_relu.data}")
-    
-    # Test Sigmoid from activations module
-    sigmoid = Sigmoid()
-    y_sigmoid = sigmoid(x)
-    print(f"Sigmoid output: {y_sigmoid.data}")
-    
-    # Test Tanh from activations module
-    tanh = Tanh()
-    y_tanh = tanh(x)
-    print(f"Tanh output: {y_tanh.data}")
-    
-    print("✅ Activation functions from activations module working!")
-    print("🎉 Clean architecture: layers module uses activations module!")
-    
-except Exception as e:
-    print(f"❌ Error: {e}")
-    print("Make sure the activations module is properly exported!")
-
-# %% [markdown]
-"""
-## Step 3: Layer Composition - Building Neural Networks
-
-Now comes the magic! We can **compose** layers to build neural networks:
-
-```
-Input → Dense → ReLU → Dense → Sigmoid → Output
-```
-
-This is a 2-layer neural network that can learn complex nonlinear patterns!
-
-**Notice the clean architecture**:
-- Dense layers handle linear transformations
-- Activation functions (from activations module) handle nonlinearity
-- Composition creates complex behaviors from simple building blocks
-"""
-
-# %%
-# Build a simple 2-layer neural network
-try:
-    print("=== Building a 2-Layer Neural Network ===")
-    
-    # Network architecture: 3 → 4 → 2
-    # Input: 3 features
-    # Hidden: 4 neurons with ReLU
-    # Output: 2 neurons with Sigmoid
-    
-    layer1 = Dense(input_size=3, output_size=4)
-    activation1 = ReLU()  # From activations module
-    layer2 = Dense(input_size=4, output_size=2)
-    activation2 = Sigmoid()  # From activations module
-    
-    print("Network architecture:")
-    print(f"  Input: 3 features")
-    print(f"  Hidden: {layer1.input_size} → {layer1.output_size} (Dense + ReLU)")
-    print(f"  Output: {layer2.input_size} → {layer2.output_size} (Dense + Sigmoid)")
-    
-    # Test with sample data
-    x = Tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])  # 2 examples, 3 features each
-    print(f"\nInput shape: {x.shape}")
-    print(f"Input data: {x.data}")
-    
-    # Forward pass through the network
-    h1 = layer1(x)           # Dense layer 1
-    h1_activated = activation1(h1)  # ReLU activation
-    h2 = layer2(h1_activated)       # Dense layer 2  
-    output = activation2(h2)        # Sigmoid activation
-    
-    print(f"\nAfter layer 1: {h1.shape}")
-    print(f"After ReLU: {h1_activated.shape}")
-    print(f"After layer 2: {h2.shape}")
-    print(f"Final output: {output.shape}")
-    print(f"Output values: {output.data}")
-    
-    print("\n🎉 Neural network working! You just built your first neural network!")
-    print("🏗️  Clean architecture: Dense layers + Activations module = Neural Network")
-    print("Notice how the network transforms 3D input into 2D output through learned transformations.")
-    
-except Exception as e:
-    print(f"❌ Error: {e}")
-    print("Make sure to implement the layers and check activations module!")
-
-# %% [markdown]
-"""
-## Step 4: Understanding What We Built
-
-Congratulations! You just implemented a clean, modular neural network architecture:
-
-### 🧱 **What You Built**
-1. **Dense Layer**: Linear transformation `y = Wx + b`
-2. **Activation Functions**: Imported from activations module (ReLU, Sigmoid, Tanh)
-3. **Layer Composition**: Chaining layers to build networks
-
-### 🏗️ **Clean Architecture Benefits**
-- **Separation of concerns**: Math functions vs. layer building blocks
-- **Reusability**: Activations can be used across different modules
-- **Maintainability**: One place to update activation implementations
-- **Composability**: Clean imports make complex networks easier to build
-
-### 🎯 **Key Insights**
-- **Layers are functions**: They transform tensors from one space to another
-- **Composition creates complexity**: Simple layers → complex networks
-- **Nonlinearity is crucial**: Without it, deep networks are just linear transformations
-- **Neural networks are function approximators**: They learn to map inputs to outputs
-- **Modular design**: Building blocks can be combined in many ways
-
-### 🚀 **What's Next**
-In the next modules, you'll learn:
-- **Training**: How networks learn from data (backpropagation, optimizers)
-- **Architectures**: Specialized layers for different problems (CNNs, RNNs)
-- **Applications**: Using networks for real problems
-
-### 🔧 **Export to Package**
-Run this to export your layers to the TinyTorch package:
-```bash
-python bin/tito.py sync
-```
-
-Then test your implementation:
-```bash
-python bin/tito.py test --module layers
-```
-
-**Great job! You've built a clean, modular foundation for neural networks!** 🎉
-"""
-
-# %%
-# Final demonstration: A more complex example
-try:
-    print("=== Final Demo: Image Classification Network ===")
-    
-    # Simulate a small image: 28x28 pixels flattened to 784 features
-    # This is like a tiny MNIST digit
-    image_size = 28 * 28  # 784 pixels
-    num_classes = 10      # 10 digits (0-9)
-    
-    # Build a 3-layer network for digit classification
-    # 784 → 128 → 64 → 10
-    layer1 = Dense(input_size=image_size, output_size=128)
-    relu1 = ReLU()  # From activations module
-    layer2 = Dense(input_size=128, output_size=64)
-    relu2 = ReLU()  # From activations module
-    layer3 = Dense(input_size=64, output_size=num_classes)
-    softmax = Sigmoid()  # Using Sigmoid as a simple "probability-like" output
-    
-    print(f"Image classification network:")
-    print(f"  Input: {image_size} pixels (28x28 image)")
-    print(f"  Hidden 1: {layer1.input_size} → {layer1.output_size} (Dense + ReLU)")
-    print(f"  Hidden 2: {layer2.input_size} → {layer2.output_size} (Dense + ReLU)")
-    print(f"  Output: {layer3.input_size} → {layer3.output_size} (Dense + Sigmoid)")
-    
-    # Simulate a batch of 5 images
-    batch_size = 5
-    fake_images = Tensor(np.random.randn(batch_size, image_size).astype(np.float32))
-    
-    # Forward pass
-    h1 = relu1(layer1(fake_images))
-    h2 = relu2(layer2(h1))
-    predictions = softmax(layer3(h2))
-    
-    print(f"\nBatch processing:")
-    print(f"  Input batch shape: {fake_images.shape}")
-    print(f"  Predictions shape: {predictions.shape}")
-    print(f"  Sample predictions: {predictions.data[0]}")  # First image predictions
-    
-    print("\n🎉 You built a neural network that could classify images!")
-    print("🏗️  Clean architecture: Dense layers + Activations module = Image Classifier")
-    print("With training, this network could learn to recognize handwritten digits!")
-    
-except Exception as e:
-    print(f"❌ Error: {e}")
-    print("Check your layer implementations and activations module!")
-
-# %% [markdown]
-"""
-## 🎓 Module Summary
-
-### What You Learned
-1. **Layer Architecture**: Dense layers as linear transformations
-2. **Clean Dependencies**: Layers module uses activations module
-3. **Function Composition**: Simple building blocks → complex networks
-4. **Modular Design**: Separation of concerns for maintainable code
-
-### Key Architectural Insight
-```
-activations (math functions) → layers (building blocks) → networks (applications)
-```
-
-This clean dependency graph makes the system:
-- **Understandable**: Each module has a clear purpose
-- **Testable**: Each module can be tested independently
-- **Reusable**: Components can be used across different contexts
-- **Maintainable**: Changes are localized to appropriate modules
-
-### Next Steps
-- **Training**: Learn how networks learn from data
-- **Advanced Architectures**: CNNs, RNNs, Transformers
-- **Applications**: Real-world machine learning problems
-
-**Congratulations on building a clean, modular neural network foundation!** 🚀
-""" 
\ No newline at end of file
+# Completion banner: prints a static summary; no checks are run here
+print("\n" + "="*50)
+print("🎉 LAYERS MODULE COMPLETE!")
+print("="*50)
+print("✅ Matrix multiplication understanding")
+print("✅ Dense layer implementation")
+print("✅ Layer composition with activations")
+print("✅ Performance awareness")
+print("✅ Comprehensive testing")
+print("\n🚀 Ready to build networks in the next module!") 
\ No newline at end of file