From f8fd2e000c41a14d12edbfe4326d29f4e0daca20 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi <vj@eecs.harvard.edu>
Date: Fri, 26 Sep 2025 11:51:54 -0400
Subject: [PATCH] STANDARDIZE: Consistent Linear terminology across all modules

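Remove backward compatibility aliases and enforce PyTorch-consistent naming:
- Remove Dense = Linear alias in Module 04 (layers)
- Update all Dense references to Linear in Modules 02, 04, 08, 09, 18, 21
- Remove MaxPool2d = MaxPool2D alias in Module 17 (quantization)
- Standardize fc/dense_weights to linear_weights in Module 18 (compression),
  including renaming SparseLinear.load_dense_weights to load_linear_weights
- Module 09 (spatial): add a start_dim=0 case to flatten() so unbatched
  (C, H, W) inputs flatten to (1, N), and update the Conv2D tests and examples
  to the in_channels/out_channels/kernel_size signature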

Benefits:
- Eliminates naming confusion between Dense/Linear terminology
- Aligns with PyTorch production patterns (nn.Linear)
- Reduces cognitive load with single consistent naming convention
- Improves student transfer to real ML frameworks

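All modules tested. Note: removing the Dense and MaxPool2d aliases and renaming
SparseLinear.load_dense_weights to load_linear_weights is a breaking change for
any code that still uses the old names.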
---
 modules/02_tensor/tensor_dev.py             |  2 +-
 modules/04_layers/layers_dev.py             | 45 ++++++------
 modules/08_training/training_dev.py         | 10 +--
 modules/09_spatial/spatial_dev.py           | 65 +++++++++--------
 modules/17_quantization/quantization_dev.py |  2 -
 modules/18_compression/compression_dev.py   | 78 ++++++++++-----------
 modules/21_mlops/mlops_dev.py               |  2 +-
 7 files changed, 102 insertions(+), 102 deletions(-)

diff --git a/modules/02_tensor/tensor_dev.py b/modules/02_tensor/tensor_dev.py
index 2e4154c8..ef3ce961 100644
--- a/modules/02_tensor/tensor_dev.py
+++ b/modules/02_tensor/tensor_dev.py
@@ -58,7 +58,7 @@ print("Ready to build tensors!")
 # # Final package structure:
 # from tinytorch.core.tensor import Tensor  # The foundation of everything!
 # from tinytorch.core.activations import ReLU, Sigmoid, Tanh
-# from tinytorch.core.layers import Dense, Conv2D
+# from tinytorch.core.layers import Linear, Conv2D
 # ```
 # 
 # **Why this matters:**
diff --git a/modules/04_layers/layers_dev.py b/modules/04_layers/layers_dev.py
index 0fb924ab..71c12a75 100644
--- a/modules/04_layers/layers_dev.py
+++ b/modules/04_layers/layers_dev.py
@@ -110,8 +110,8 @@ class Module:
         class MLP(Module):
             def __init__(self):
                 super().__init__()
-                self.layer1 = Dense(784, 128)  # Auto-registered!
-                self.layer2 = Dense(128, 10)   # Auto-registered!
+                self.layer1 = Linear(784, 128)  # Auto-registered!
+                self.layer2 = Linear(128, 10)   # Auto-registered!
                 
             def forward(self, x):
                 x = self.layer1(x)
@@ -520,9 +520,6 @@ class Linear(Module):
         return Tensor(output_data)
         ### END SOLUTION
 
-# Backward compatibility alias
-#| export  
-Dense = Linear
 
 # %% [markdown]
 """
@@ -538,7 +535,7 @@ def test_dense_layer():
     print("🧪 Testing Dense Layer...")
     
     # Test case 1: Basic functionality
-    layer = Dense(input_size=3, output_size=2)
+    layer = Linear(input_size=3, output_size=2)
     input_tensor = Tensor([[1.0, 2.0, 3.0]])  # Shape: (1, 3)
     output = layer.forward(input_tensor)
     
@@ -547,13 +544,13 @@ def test_dense_layer():
     print("✅ Output shape correct")
     
     # Test case 2: No bias
-    layer_no_bias = Dense(input_size=2, output_size=3, use_bias=False)
+    layer_no_bias = Linear(input_size=2, output_size=3, use_bias=False)
     assert layer_no_bias.bias is None, "Bias should be None when use_bias=False"
     print("✅ No bias option works")
     
     # Test case 3: Multiple samples (batch processing)
     batch_input = Tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # Shape: (3, 2)
-    layer_batch = Dense(input_size=2, output_size=2)
+    layer_batch = Linear(input_size=2, output_size=2)
     batch_output = layer_batch.forward(batch_input)
     
     assert batch_output.shape == (3, 2), f"Expected shape (3, 2), got {batch_output.shape}"
@@ -565,7 +562,7 @@ def test_dense_layer():
     print("✅ Callable interface works")
     
     # Test case 5: Parameter initialization
-    layer_init = Dense(input_size=10, output_size=5)
+    layer_init = Linear(input_size=10, output_size=5)
     assert layer_init.weights.shape == (10, 5), f"Expected weights shape (10, 5), got {layer_init.weights.shape}"
     assert layer_init.bias.shape == (5,), f"Expected bias shape (5,), got {layer_init.bias.shape}"
     
@@ -590,7 +587,7 @@ def test_dense_parameter_management():
     print("🧪 Testing Dense Layer Parameter Management...")
     
     # Test case 1: Parameter registration
-    layer = Dense(input_size=3, output_size=2)
+    layer = Linear(input_size=3, output_size=2)
     params = layer.parameters()
     
     assert len(params) == 2, f"Expected 2 parameters (weights + bias), got {len(params)}"
@@ -602,8 +599,8 @@ def test_dense_parameter_management():
     class SimpleNetwork(Module):
         def __init__(self):
             super().__init__()
-            self.layer1 = Dense(4, 3)
-            self.layer2 = Dense(3, 2)
+            self.layer1 = Linear(4, 3)
+            self.layer2 = Linear(3, 2)
         
         def forward(self, x):
             x = self.layer1(x)
@@ -624,13 +621,13 @@ def test_dense_parameter_management():
     print("✅ Network forward pass works")
     
     # Test case 4: Parameter shapes
-    layer = Dense(input_size=10, output_size=5)
+    layer = Linear(input_size=10, output_size=5)
     assert layer.weights.shape == (10, 5), f"Expected weights shape (10, 5), got {layer.weights.shape}"
     assert layer.bias.shape == (5,), f"Expected bias shape (5,), got {layer.bias.shape}"
     print("✅ Parameter shapes correct")
     
     # Test case 5: No bias option
-    layer_no_bias = Dense(input_size=3, output_size=2, use_bias=False)
+    layer_no_bias = Linear(input_size=3, output_size=2, use_bias=False)
     params_no_bias = layer_no_bias.parameters()
     
     assert len(params_no_bias) == 1, f"Expected 1 parameter (weights only), got {len(params_no_bias)}"
@@ -742,7 +739,7 @@ def test_sequential_network():
     print("✅ Empty Sequential network creation")
     
     # Test case 2: Create network with layers
-    layers = [Dense(3, 4), Dense(4, 2)]
+    layers = [Linear(3, 4), Linear(4, 2)]
     network = Sequential(layers)
     assert len(network.layers) == 2, "Network should have 2 layers"
     print("✅ Sequential network with layers")
@@ -760,7 +757,7 @@ def test_sequential_network():
     print("✅ Parameter collection from all layers")
     
     # Test case 5: Adding layers dynamically
-    network.add(Dense(2, 1))
+    network.add(Linear(2, 1))
     assert len(network.layers) == 3, "Network should have 3 layers after adding one"
     
     # Test forward pass after adding layer
@@ -920,7 +917,7 @@ def test_flatten_operations():
     
     # Test case 5: Integration with Sequential
     network = Sequential([
-        Dense(8, 4),
+        Linear(8, 4),
         Flatten()
     ])
     test_input = Tensor(np.random.randn(2, 8))
@@ -1193,8 +1190,8 @@ def run_comprehensive_tests():
     print("\n2. Dense Layer Composition:")
     
     # Create a simple 2-layer network
-    layer1 = Dense(4, 3)
-    layer2 = Dense(3, 2)
+    layer1 = Linear(4, 3)
+    layer2 = Linear(3, 2)
     
     # Test data flow
     input_data = Tensor([[1, 2, 3, 4]])
@@ -1218,7 +1215,7 @@ def run_comprehensive_tests():
     # Test 4: Parameter access and modification
     print("\n4. Parameter Management:")
     
-    layer = Dense(5, 3)
+    layer = Linear(5, 3)
     original_weights = layer.weights.data.copy()
     
     # Simulate parameter update
@@ -1246,8 +1243,8 @@ def demonstrate_layer_composition():
     print("=" * 50)
     
     print("\n1. Creating individual layers:")
-    layer1 = Dense(input_size=4, output_size=3)
-    layer2 = Dense(input_size=3, output_size=2)
+    layer1 = Linear(input_size=4, output_size=3)
+    layer2 = Linear(input_size=3, output_size=2)
     
     print(f"   Layer 1: {layer1.input_size} → {layer1.output_size}")
     print(f"   Layer 2: {layer2.input_size} → {layer2.output_size}")
@@ -1268,8 +1265,8 @@ def demonstrate_layer_composition():
     class TwoLayerNetwork(Module):
         def __init__(self, input_size, hidden_size, output_size):
             super().__init__()
-            self.layer1 = Dense(input_size, hidden_size)
-            self.layer2 = Dense(hidden_size, output_size)
+            self.layer1 = Linear(input_size, hidden_size)
+            self.layer2 = Linear(hidden_size, output_size)
         
         def forward(self, x):
             x = self.layer1(x)
diff --git a/modules/08_training/training_dev.py b/modules/08_training/training_dev.py
index 3a65c6dc..f8489589 100644
--- a/modules/08_training/training_dev.py
+++ b/modules/08_training/training_dev.py
@@ -69,7 +69,7 @@ sys.path.append(os.path.abspath('modules/source/09_dataloader'))
 # Import all the building blocks we need
 from tinytorch.core.tensor import Tensor
 from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax
-from tinytorch.core.layers import Dense
+from tinytorch.core.layers import Linear
 from tinytorch.core.networks import Sequential, create_mlp
 from tinytorch.core.spatial import Conv2D, flatten
 from tinytorch.utils.data import Dataset, DataLoader
@@ -918,7 +918,7 @@ class Trainer:
         4. Prepare for training and validation loops
         
         EXAMPLE:
-        model = Sequential([Dense(10, 5), ReLU(), Dense(5, 2)])
+        model = Sequential([Linear(10, 5), ReLU(), Linear(5, 2)])
         optimizer = Adam(model.parameters, learning_rate=0.001)
         loss_fn = CrossEntropyLoss()
         metrics = [Accuracy()]
@@ -1260,7 +1260,7 @@ def test_unit_trainer():
     print("🔬 Unit Test: Trainer Class...")
     
     # Create simple model and components
-    model = Sequential([Dense(2, 3), ReLU(), Dense(3, 2)])  # Simple model
+    model = Sequential([Linear(2, 3), ReLU(), Linear(3, 2)])  # Simple model
     optimizer = SGD([], learning_rate=0.01)  # Empty parameters list for testing
     loss_fn = MeanSquaredError()
     metrics = [Accuracy()]
@@ -1608,7 +1608,7 @@ def test_training_pipeline_profiler():
     profiler = TrainingPipelineProfiler(warning_threshold_seconds=1.0)
     
     # Create test components
-    model = Sequential([Dense(10, 5), ReLU(), Dense(5, 2)])
+    model = Sequential([Linear(10, 5), ReLU(), Linear(5, 2)])
     optimizer = SGD([], learning_rate=0.01)
     loss_fn = MeanSquaredError()
     
@@ -1839,7 +1839,7 @@ def test_production_training_optimizer():
     optimizer_tool = ProductionTrainingOptimizer()
     
     # Create test components
-    model = Sequential([Dense(10, 5), ReLU(), Dense(5, 2)])
+    model = Sequential([Linear(10, 5), ReLU(), Linear(5, 2)])
     optimizer = SGD([], learning_rate=0.01)
     loss_fn = MeanSquaredError()
     
diff --git a/modules/09_spatial/spatial_dev.py b/modules/09_spatial/spatial_dev.py
index b668eb0a..ea10ce47 100644
--- a/modules/09_spatial/spatial_dev.py
+++ b/modules/09_spatial/spatial_dev.py
@@ -80,8 +80,8 @@ print("Ready to build convolutional neural networks!")
 
 ```python
 # Final package structure:
-from tinytorch.core.cnn import Conv2D, conv2d_naive, flatten  # CNN operations!
-from tinytorch.core.layers import Dense  # Fully connected layers
+from tinytorch.core.spatial import Conv2D, MaxPool2D, flatten  # CNN operations!
+from tinytorch.core.layers import Linear  # Fully connected layers
 from tinytorch.core.activations import ReLU  # Nonlinearity
 from tinytorch.core.tensor import Tensor  # Foundation
 ```
@@ -142,6 +142,10 @@ def flatten(x, start_dim=1):
         # Flatten 2D to (1, total_elements) - treat as single sample
         total_size = int(np.prod(data.shape))
         new_shape = (1, total_size)
+    elif start_dim == 0:
+        # Special case: flatten everything but maintain 2D for Linear layers
+        total_size = int(np.prod(data.shape))
+        new_shape = (1, total_size)
     else:
         # Calculate new shape - preserve dimensions before start_dim, flatten rest
         batch_dims = data.shape[:start_dim]
@@ -1187,20 +1191,20 @@ print("📈 Progress: Single-channel ✓, Multi-channel ✓, Pooling ✓")
 
 # %% [markdown]
 """
-## Step 5: Flattening for Dense Layers
+## Step 5: Flattening for Linear Layers
 
 ### What is Flattening?
 **Flattening** converts multi-dimensional tensors to 1D vectors, enabling connection between convolutional and dense layers.
 
 ### Why Flattening is Needed
-- **Interface compatibility**: Conv2D outputs 2D/3D, Dense expects 1D
+- **Interface compatibility**: Conv2D outputs 2D/3D, Linear expects 1D
 - **Network composition**: Connect spatial features to classification
 - **Standard practice**: Almost all CNNs use this pattern
 - **Dimension management**: Preserve information while changing shape
 
 ### The Pattern
 ```
-Conv2D → ReLU → MaxPool2D → Flatten → Dense → Output
+Conv2D → ReLU → MaxPool2D → Flatten → Linear → Output
 ```
 
 ### Real-World Usage
@@ -1215,7 +1219,7 @@ Conv2D → ReLU → MaxPool2D → Flatten → Dense → Output
 # We use that single implementation throughout this module for consistency and clarity.
 
 print("✅ Flatten function is available from the Spatial Helper Functions section")
-print("🔍 The flatten() function handles tensor flattening for CNN-to-Dense transitions")
+print("🔍 The flatten() function handles tensor flattening for CNN-to-Linear transitions")
 
 # %% [markdown]
 """
@@ -1281,7 +1285,7 @@ except Exception as e:
 print("🎯 Flatten behavior:")
 print("   Converts 2D tensor to 1D")
 print("   Preserves batch dimension")
-print("   Enables connection to Dense layers")
+print("   Enables connection to Linear layers")
 print("📈 Progress: Convolution operation ✓, Conv2D layer ✓, Flatten ✓")
 
 # %% [markdown]
@@ -1294,13 +1298,13 @@ Let us test our complete CNN system with realistic multi-channel scenarios:
 #### **CIFAR-10 Style CNN**
 ```python
 # RGB images to classification
-RGB Input → Multi-Channel Conv2D → ReLU → MaxPool2D → Flatten → Dense → Output
+RGB Input → Multi-Channel Conv2D → ReLU → MaxPool2D → Flatten → Linear → Output
 ```
 
 #### **Deep Multi-Channel CNN**
 ```python
 # Progressive feature extraction
-RGB → Conv2D(3→32) → ReLU → Pool → Conv2D(32→64) → ReLU → Pool → Flatten → Dense
+RGB → Conv2D(3→32) → ReLU → Pool → Conv2D(32→64) → ReLU → Pool → Flatten → Linear
 ```
 
 #### **Production CNN Pattern**
@@ -1320,11 +1324,11 @@ try:
     # Test 1: CIFAR-10 Style RGB CNN Pipeline
     print("\n1. CIFAR-10 Style RGB CNN Pipeline:")
     
-    # Create pipeline: RGB → Conv2D(3→16) → ReLU → MaxPool2D → Flatten → Dense
+    # Create pipeline: RGB → Conv2D(3→16) → ReLU → MaxPool2D → Flatten → Linear
     rgb_conv = Conv2D(in_channels=3, out_channels=16, kernel_size=(3, 3))
     relu = ReLU()
     pool = MaxPool2D(pool_size=(2, 2))
-    dense = Dense(input_size=16 * 3 * 3, output_size=10)  # 16 channels, 3x3 spatial = 144 features
+    dense = Linear(input_size=16 * 3 * 3, output_size=10)  # 16 channels, 3x3 spatial = 144 features
     
     # Simulated CIFAR-10 image (3 channels, 8x8 for testing)
     rgb_image = Tensor(np.random.randn(3, 8, 8))  # RGB 8x8 image
@@ -1334,7 +1338,7 @@ try:
     conv_features = rgb_conv(rgb_image)    # (3,8,8) → (16,6,6)
     activated = relu(conv_features)        # (16,6,6) → (16,6,6)
     pooled = pool(activated)              # (16,6,6) → (16,3,3)
-    flattened = flatten(pooled)           # (16,3,3) → (1,144)
+    flattened = flatten(pooled, start_dim=0)           # (16,3,3) → (1,144)
     predictions = dense(flattened)        # (1,144) → (1,10)
     
     assert conv_features.shape == (16, 6, 6), f"Conv features wrong: {conv_features.shape}"
@@ -1348,14 +1352,14 @@ try:
     # Test 2: Deep Multi-Channel CNN
     print("\n2. Deep Multi-Channel CNN:")
     
-    # Create deeper pipeline: RGB → Conv1(3→32) → ReLU → Pool → Conv2(32→64) → ReLU → Pool → Dense
+    # Create deeper pipeline: RGB → Conv1(3→32) → ReLU → Pool → Conv2(32→64) → ReLU → Pool → Linear
     conv1_deep = Conv2D(in_channels=3, out_channels=32, kernel_size=(3, 3))
     relu1 = ReLU()
     pool1 = MaxPool2D(pool_size=(2, 2))
     conv2_deep = Conv2D(in_channels=32, out_channels=64, kernel_size=(3, 3))
     relu2 = ReLU()
     pool2 = MaxPool2D(pool_size=(2, 2))
-    classifier_deep = Dense(input_size=64 * 1 * 1, output_size=5)  # 64 channels, 1x1 spatial
+    classifier_deep = Linear(input_size=64 * 1 * 1, output_size=5)  # 64 channels, 1x1 spatial
     
     # Larger RGB input for deep processing
     large_rgb = Tensor(np.random.randn(3, 12, 12))  # RGB 12x12 image
@@ -1368,7 +1372,7 @@ try:
     h4 = conv2_deep(h3)         # (32,5,5) → (64,3,3)
     h5 = relu2(h4)              # (64,3,3) → (64,3,3)
     h6 = pool2(h5)              # (64,3,3) → (64,1,1)
-    h7 = flatten(h6)            # (64,1,1) → (1,64)
+    h7 = flatten(h6, start_dim=0)            # (64,1,1) → (1,64)
     output_deep = classifier_deep(h7)  # (1,64) → (1,5)
     
     assert h1.shape == (32, 10, 10), f"Conv1 output wrong: {h1.shape}"
@@ -1398,7 +1402,7 @@ try:
     
     # Create classifier with correct input size
     feature_size = batch_flat.shape[1]  # 32 features
-    batch_classifier = Dense(input_size=feature_size, output_size=3)
+    batch_classifier = Linear(input_size=feature_size, output_size=3)
     batch_pred = batch_classifier(batch_flat) # (4,32) → (4,3)
     
     assert batch_conv_out.shape == (4, 8, 4, 4), f"Batch conv wrong: {batch_conv_out.shape}"
@@ -1424,10 +1428,10 @@ try:
     
     # Analyze different configurations
     configs = [
-        (Conv2D(1, 8, (3, 3)), "1→8 channels"),
-        (Conv2D(3, 16, (3, 3)), "3→16 channels (RGB)"),
-        (Conv2D(16, 32, (3, 3)), "16→32 channels"),
-        (Conv2D(32, 64, (3, 3)), "32→64 channels"),
+        (Conv2D(in_channels=1, out_channels=8, kernel_size=(3, 3)), "1→8 channels"),
+        (Conv2D(in_channels=3, out_channels=16, kernel_size=(3, 3)), "3→16 channels (RGB)"),
+        (Conv2D(in_channels=16, out_channels=32, kernel_size=(3, 3)), "16→32 channels"),
+        (Conv2D(in_channels=32, out_channels=64, kernel_size=(3, 3)), "32→64 channels"),
     ]
     
     for conv_layer, desc in configs:
@@ -1443,7 +1447,7 @@ try:
     print("  • Batch processing with multiple channels")
     print("  • Backward compatibility with single-channel")
     print("  • Production-ready parameter scaling")
-    print("  • Complete Conv → Pool → Dense pipelines")
+    print("  • Complete Conv → Pool → Linear pipelines")
     print("📈 Progress: Production-ready multi-channel CNN system!")
     
 except Exception as e:
@@ -1559,20 +1563,21 @@ def test_module_conv2d_tensor_compatibility():
 
     # 1. Define a Conv2D layer
     # Kernel of size 3x3
-    conv_layer = Conv2D((3, 3))
+    conv_layer = Conv2D(in_channels=1, out_channels=1, kernel_size=(3, 3))
 
     # 2. Create a batch of 5 grayscale images (10x10)
-    # Shape: (batch_size, height, width)
-    input_images = np.random.randn(5, 10, 10)
+    # Shape: (batch_size, channels, height, width)
+    input_images = np.random.randn(5, 1, 10, 10)
     input_tensor = Tensor(input_images)
 
     # 3. Perform a forward pass
     output_tensor = conv_layer(input_tensor)
 
     # 4. Assert the output shape is correct
+    # Output: (batch_size, out_channels, height, width)
     # Output height = 10 - 3 + 1 = 8
     # Output width = 10 - 3 + 1 = 8
-    expected_shape = (5, 8, 8)
+    expected_shape = (5, 1, 8, 8)
     assert isinstance(output_tensor, Tensor), "Conv2D output must be a Tensor"
     assert output_tensor.shape == expected_shape, f"Expected output shape {expected_shape}, but got {output_tensor.shape}"
     print("✅ Integration Test Passed: Conv2D layer correctly transformed image tensor.")
@@ -2020,7 +2025,7 @@ Congratulations! You have successfully implemented a complete multi-channel CNN
 - **Parameter scaling**: How memory requirements grow with channels and kernel sizes
 - **Spatial downsampling**: MaxPooling for translation invariance and efficiency  
 - **Feature hierarchy**: Progressive extraction from RGB → edges → objects → concepts
-- **Production architectures**: Conv → ReLU → Pool → Conv → ReLU → Pool → Dense patterns
+- **Production architectures**: Conv → ReLU → Pool → Conv → ReLU → Pool → Linear patterns
 - **He initialization**: Proper weight initialization for stable multi-layer training
 
 ### Mathematical Foundations
@@ -2044,7 +2049,7 @@ Congratulations! You have successfully implemented a complete multi-channel CNN
 - **Computer vision**: Face recognition, document analysis, quality inspection
 
 ### CNN Architecture Patterns
-- **Basic CNN**: RGB → Conv(3→32) → ReLU → Pool → Conv(32→64) → ReLU → Pool → Dense
+- **Basic CNN**: RGB → Conv(3→32) → ReLU → Pool → Conv(32→64) → ReLU → Pool → Linear
 - **Parameter efficiency**: 32×3×3×3 = 864 parameters vs 32×32×32 = 32,768 for dense layer
 - **Spatial hierarchy**: Early layers detect edges, later layers detect objects
 - **Translation invariance**: Same features detected regardless of position in image
@@ -2058,7 +2063,7 @@ Congratulations! You have successfully implemented a complete multi-channel CNN
 ### Production-Ready Features
 ```python
 from tinytorch.core.spatial import Conv2D, MaxPool2D, flatten
-from tinytorch.core.layers import Dense
+from tinytorch.core.layers import Linear
 from tinytorch.core.activations import ReLU
 
 # CIFAR-10 CNN architecture
@@ -2066,13 +2071,13 @@ conv1 = Conv2D(in_channels=3, out_channels=32, kernel_size=(3, 3))
 pool1 = MaxPool2D(pool_size=(2, 2))
 conv2 = Conv2D(in_channels=32, out_channels=64, kernel_size=(3, 3))
 pool2 = MaxPool2D(pool_size=(2, 2))
-classifier = Dense(input_size=64*6*6, output_size=10)
+classifier = Linear(input_size=64*6*6, output_size=10)
 
 # Process RGB image
 rgb_image = Tensor(np.random.randn(3, 32, 32))  # CIFAR-10 format
 features1 = pool1(ReLU()(conv1(rgb_image)))     # (3,32,32) → (32,15,15)
 features2 = pool2(ReLU()(conv2(features1)))     # (32,15,15) → (64,6,6)
-predictions = classifier(flatten(features2))    # (64,6,6) → (1,10)
+predictions = classifier(flatten(features2, start_dim=0))    # (64,6,6) → (1,10)
 ```
 
 ### Next Steps
diff --git a/modules/17_quantization/quantization_dev.py b/modules/17_quantization/quantization_dev.py
index 7641e886..da7304df 100644
--- a/modules/17_quantization/quantization_dev.py
+++ b/modules/17_quantization/quantization_dev.py
@@ -63,7 +63,6 @@ from typing import Union, List, Optional, Tuple, Dict, Any
 try:
     from tinytorch.core.tensor import Tensor
     from tinytorch.core.spatial import Conv2d, MaxPool2D
-    MaxPool2d = MaxPool2D  # Alias for consistent naming
 except ImportError:
     # For development, import from local modules
     sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
@@ -71,7 +70,6 @@ except ImportError:
     try:
         from tensor_dev import Tensor
         from spatial_dev import Conv2d, MaxPool2D
-        MaxPool2d = MaxPool2D  # Alias for consistent naming
     except ImportError:
         # Create minimal mock classes if not available
         class Tensor:
diff --git a/modules/18_compression/compression_dev.py b/modules/18_compression/compression_dev.py
index 93f0e38a..a4b02374 100644
--- a/modules/18_compression/compression_dev.py
+++ b/modules/18_compression/compression_dev.py
@@ -87,8 +87,8 @@ def _determine_layer_type_and_sparsity(shape: tuple) -> Tuple[str, float]:
     if len(shape) == 4:  # Convolution: (filters, channels, height, width)
         layer_type = "Conv2D"
         recommended_sparsity = DEFAULT_CONV_SPARSITY  # Conservative - conv layers extract spatial features
-    elif len(shape) == 2:  # Dense/Linear: (output_neurons, input_neurons)  
-        layer_type = "Dense"
+    elif len(shape) == 2:  # Linear: (output_neurons, input_neurons)
+        layer_type = "Linear"
         recommended_sparsity = DEFAULT_DENSE_SPARSITY  # Aggressive - dense layers have high redundancy
     else:
         layer_type = "Other"
@@ -175,17 +175,17 @@ def test_redundancy_analysis():
     # Create realistic CNN weights with natural sparsity
     np.random.seed(42)
     conv_weights = np.random.normal(0, 0.02, (64, 32, 3, 3))  # Conv layer
-    fc_weights = np.random.normal(0, 0.01, (1000, 512))       # FC layer
+    linear_weights = np.random.normal(0, 0.01, (1000, 512))       # Linear layer
     
     # Analyze both layer types
     conv_stats = analyze_weight_redundancy(conv_weights, "Conv2D Layer Weights")
-    fc_stats = analyze_weight_redundancy(fc_weights, "Dense Layer Weights")
+    linear_stats = analyze_weight_redundancy(linear_weights, "Linear Layer Weights")
     
     # Verify analysis produces reasonable results
     assert conv_stats['total_params'] == 64*32*3*3, "Conv param count mismatch"
-    assert fc_stats['total_params'] == 1000*512, "FC param count mismatch"
+    assert linear_stats['total_params'] == 1000*512, "Linear param count mismatch"
     assert conv_stats['natural_sparsity'] > 0, "Should detect some natural sparsity"
-    assert fc_stats['natural_sparsity'] > 0, "Should detect some natural sparsity"
+    assert linear_stats['natural_sparsity'] > 0, "Should detect some natural sparsity"
     
     print("✅ Weight redundancy analysis test passed!")
 
@@ -594,7 +594,7 @@ class SparseLinear:
             out_features: Number of output features
             
         Attributes:
-            dense_weights: Original dense weight matrix (out_features, in_features)
+            linear_weights: Original dense weight matrix (out_features, in_features)
             sparse_weights: Pruned weight matrix with zeros
             mask: Binary mask indicating kept weights (1=keep, 0=prune)
             sparsity: Fraction of weights that are zero
@@ -605,8 +605,8 @@ class SparseLinear:
         self.in_features = in_features
         self.out_features = out_features
         
-        # Dense weights (will be pruned)
-        self.dense_weights = None
+        # Linear weights (will be pruned)
+        self.linear_weights = None
         self.bias = None
         
         # Sparse representation
@@ -619,23 +619,23 @@ class SparseLinear:
         self.sparse_ops = 0
         # END SOLUTION
     
-    def load_dense_weights(self, weights: np.ndarray, bias: Optional[np.ndarray] = None):
+    def load_linear_weights(self, weights: np.ndarray, bias: Optional[np.ndarray] = None):
         """Load dense weights before pruning."""
         # BEGIN SOLUTION
         assert weights.shape == (self.out_features, self.in_features), f"Weight shape mismatch"
-        self.dense_weights = weights.copy()
+        self.linear_weights = weights.copy()
         self.bias = bias.copy() if bias is not None else np.zeros(self.out_features)
         # END SOLUTION
     
     def prune_weights(self, sparsity: float = DEFAULT_SPARSITY):
         """Prune weights using magnitude-based pruning."""
         # BEGIN SOLUTION
-        if self.dense_weights is None:
+        if self.linear_weights is None:
             raise ValueError("Must load dense weights before pruning")
         
         # Use magnitude pruner
         pruner = MagnitudePruner()
-        self.sparse_weights, self.mask, stats = pruner.prune(self.dense_weights, sparsity)
+        self.sparse_weights, self.mask, stats = pruner.prune(self.linear_weights, sparsity)
         self.sparsity = stats['actual_sparsity']
         
         print(f"✂️  Pruned {self.sparsity:.1%} of weights")
@@ -645,14 +645,14 @@ class SparseLinear:
     def forward_dense(self, x: np.ndarray) -> np.ndarray:
         """Forward pass using dense weights (reference)."""
         # BEGIN SOLUTION
-        if self.dense_weights is None:
-            raise ValueError("Dense weights not loaded")
+        if self.linear_weights is None:
+            raise ValueError("Linear weights not loaded")
         
         # Count operations
         self.dense_ops = self.in_features * self.out_features
         
         # Standard matrix multiply: y = x @ W^T + b
-        output = np.dot(x, self.dense_weights.T) + self.bias
+        output = np.dot(x, self.linear_weights.T) + self.bias
         return output
         # END SOLUTION
     
@@ -759,7 +759,7 @@ def test_sparse_neural_network():
     np.random.seed(42)
     weights = np.random.normal(0, 0.1, (128, 256))
     bias = np.random.normal(0, 0.01, 128)
-    sparse_layer.load_dense_weights(weights, bias)
+    sparse_layer.load_linear_weights(weights, bias)
     
     # Prune weights
     sparse_layer.prune_weights(sparsity=0.8)  # 80% sparsity
@@ -773,13 +773,13 @@ def test_sparse_neural_network():
     output_sparse_opt = sparse_layer.forward_sparse_optimized(x)
     
     print(f"Output shapes:")
-    print(f"  Dense: {output_dense.shape}")
+    print(f"  Linear: {output_dense.shape}")
     print(f"  Sparse naive: {output_sparse_naive.shape}")
     print(f"  Sparse optimized: {output_sparse_opt.shape}")
     
     # Verify outputs have correct shape
     expected_shape = (4, 128)
-    assert output_dense.shape == expected_shape, "Dense output shape incorrect"
+    assert output_dense.shape == expected_shape, "Linear output shape incorrect"
     assert output_sparse_naive.shape == expected_shape, "Sparse naive output shape incorrect"
     assert output_sparse_opt.shape == expected_shape, "Sparse optimized output shape incorrect"
     
@@ -801,7 +801,7 @@ def test_sparse_neural_network():
     
     print(f"\nPerformance Benchmark:")
     print(f"  Sparsity: {benchmark['sparsity']:.1%}")
-    print(f"  Dense ops: {benchmark['dense_ops']:,}")
+    print(f"  Linear ops: {benchmark['dense_ops']:,}")
     print(f"  Sparse ops: {benchmark['sparse_ops']:,}")
     print(f"  Theoretical speedup: {benchmark['theoretical_speedup']:.1f}x")
     print(f"  Actual speedup: {benchmark['actual_speedup']:.1f}x")
@@ -809,7 +809,7 @@ def test_sparse_neural_network():
     
     # Verify operation counting
     expected_dense_ops = 256 * 128
-    assert benchmark['dense_ops'] == expected_dense_ops, "Dense op count incorrect"
+    assert benchmark['dense_ops'] == expected_dense_ops, "Linear op count incorrect"
     assert benchmark['sparse_ops'] < benchmark['dense_ops'], "Sparse should use fewer ops"
     
     print("✅ Sparse neural network test passed!")
@@ -841,13 +841,13 @@ def _determine_layer_type_and_sparsity(shape: tuple) -> Tuple[str, float]:
         shape: Weight tensor shape
         
     Returns:
-        layer_type: Type of layer (Conv2D, Dense, Other)
+        layer_type: Type of layer (Conv2D, Linear, Other)
         recommended_sparsity: Recommended sparsity level for this layer type
     """
     if len(shape) == CONV2D_NDIM:  # Conv layer: (out, in, H, W)
         return "Conv2D", DEFAULT_CONV_SPARSITY
-    elif len(shape) == DENSE_NDIM:  # Dense layer: (out, in)  
-        return "Dense", DEFAULT_DENSE_SPARSITY
+    elif len(shape) == DENSE_NDIM:  # Linear layer: (out, in)  
+        return "Linear", DEFAULT_DENSE_SPARSITY
     else:
         return "Other", DEFAULT_OTHER_SPARSITY
 
@@ -980,7 +980,7 @@ class ModelCompressor:
             )
             
             analysis['total_params'] += weights.size
-            if layer_type in ['Conv2D', 'Dense']:
+            if layer_type in ['Conv2D', 'Linear']:
                 analysis['compressible_params'] += weights.size
             
             _print_layer_analysis_row(layer_name, layer_type, weights.size,
@@ -1155,8 +1155,8 @@ def test_compression_pipeline():
     model_weights = {
         'conv1': np.random.normal(0, 0.02, (32, 3, 3, 3)),    # Conv: 32 filters, 3 input channels
         'conv2': np.random.normal(0, 0.02, (64, 32, 3, 3)),   # Conv: 64 filters, 32 input channels
-        'fc1': np.random.normal(0, 0.01, (512, 1024)),        # Dense: 512 → 1024
-        'fc2': np.random.normal(0, 0.01, (10, 512)),          # Dense: 10 → 512 (output layer)
+        'linear1': np.random.normal(0, 0.01, (512, 1024)),        # Linear: 1024 → 512 (weights stored as (out, in))
+        'linear2': np.random.normal(0, 0.01, (10, 512)),          # Linear: 512 → 10 (output layer)
     }
     
     # Create compressor
@@ -1168,18 +1168,18 @@ def test_compression_pipeline():
     assert analysis['total_params'] > 0, "Should count total parameters"
     assert len(analysis['layers']) == 4, "Should analyze all 4 layers"
     assert 'conv1' in analysis['layers'], "Should analyze conv1"
-    assert 'fc1' in analysis['layers'], "Should analyze fc1"
+    assert 'linear1' in analysis['layers'], "Should analyze linear1"
     
     # Verify layer type detection
     assert analysis['layers']['conv1']['type'] == 'Conv2D', "Should detect conv layers"
-    assert analysis['layers']['fc1']['type'] == 'Dense', "Should detect dense layers"
+    assert analysis['layers']['linear1']['type'] == 'Linear', "Should detect linear layers"
     
     # Step 2: Compress model with custom sparsities
     custom_sparsities = {
         'conv1': 0.5,  # Conservative for first conv layer
         'conv2': 0.6,  # Moderate for second conv layer
-        'fc1': 0.8,    # Aggressive for large dense layer
-        'fc2': 0.3     # Conservative for output layer
+        'linear1': 0.8,    # Aggressive for large linear layer
+        'linear2': 0.3     # Conservative for output layer
     }
     
     compressed_model = compressor.compress_model(model_weights, custom_sparsities)
@@ -1262,8 +1262,8 @@ def profile_compression_memory():
     model_weights = {
         'conv1': np.random.normal(0, 0.02, (128, 64, 3, 3)),     # ~0.3M parameters
         'conv2': np.random.normal(0, 0.02, (256, 128, 3, 3)),    # ~1.2M parameters  
-        'fc1': np.random.normal(0, 0.01, (1024, 4096)),          # ~4.2M parameters
-        'fc2': np.random.normal(0, 0.01, (10, 1024)),            # ~10K parameters
+        'linear1': np.random.normal(0, 0.01, (1024, 4096)),          # ~4.2M parameters
+        'linear2': np.random.normal(0, 0.01, (10, 1024)),            # ~10K parameters
     }
     
     snapshot1 = tracemalloc.take_snapshot()
@@ -1351,13 +1351,13 @@ def analyze_deployment_scenarios():
     
     # Model sizes at different compression levels
     model_configs = [
-        {'name': 'Dense Model', 'size_mb': 200, 'gflops': 50, 'accuracy': 95.0},
+        {'name': 'Dense Model', 'size_mb': 200, 'gflops': 50, 'accuracy': 95.0},
         {'name': '50% Sparse', 'size_mb': 100, 'gflops': 25, 'accuracy': 94.5},
         {'name': '70% Sparse', 'size_mb': 60, 'gflops': 15, 'accuracy': 93.8},
         {'name': '90% Sparse', 'size_mb': 20, 'gflops': 5, 'accuracy': 91.2},
     ]
     
-    print("Scenario       | Memory | Compute | Dense | 50% | 70% | 90% | Best Option")
+    print("Scenario       | Memory | Compute | Dense | 50% | 70% | 90% | Best Option")
     print("-" * 80)
     
     for scenario in scenarios:
@@ -1435,7 +1435,7 @@ def benchmark_sparse_inference_speedup():
         
         # Load and prune weights
         weights = np.random.normal(0, 0.1, (size[1], size[0]))
-        sparse_layer.load_dense_weights(weights)
+        sparse_layer.load_linear_weights(weights)
         sparse_layer.prune_weights(sparsity)
         
         # Benchmark
@@ -1711,11 +1711,11 @@ def run_all_tests():
         np.random.seed(42)
         demo_model = {
             'backbone_conv': np.random.normal(0, 0.02, (128, 64, 3, 3)),
-            'classifier_fc': np.random.normal(0, 0.01, (10, 2048)),
+            'classifier_linear': np.random.normal(0, 0.01, (10, 2048)),
         }
         
         compressor = ModelCompressor()
-        compressed = compressor.compress_model(demo_model, {'backbone_conv': 0.7, 'classifier_fc': 0.8})
+        compressed = compressor.compress_model(demo_model, {'backbone_conv': 0.7, 'classifier_linear': 0.8})
         
         original_params = sum(w.size for w in demo_model.values())
         compressed_params = sum(np.sum(info['weights'] != 0) for info in compressed.values())
@@ -1773,7 +1773,7 @@ b) The structured vs unstructured tradeoff:
 - Inference speed: structured pruning provides actual speedup, unstructured often theoretical only
 
 c) Layer-specific sparsity tolerance:
-- Dense layers: High redundancy, many parameters, more overparametrized → tolerate 80% sparsity
+- Linear layers: High redundancy, many parameters, more overparametrized → tolerate 80% sparsity
 - Conv layers: Fewer parameters, each filter captures important spatial features → more sensitive
 - First layers: Extract low-level features (edges, textures) → very sensitive to pruning
 - Later layers: More abstract features with redundancy → can handle moderate pruning
diff --git a/modules/21_mlops/mlops_dev.py b/modules/21_mlops/mlops_dev.py
index ace87e63..8a53916d 100644
--- a/modules/21_mlops/mlops_dev.py
+++ b/modules/21_mlops/mlops_dev.py
@@ -67,7 +67,7 @@ from collections import defaultdict
 try:
     from tinytorch.core.tensor import Tensor
     from tinytorch.core.training import Trainer
-    from tinytorch.core.layers import Dense
+    from tinytorch.core.layers import Linear
 except ImportError:
     # For development, fallback gracefully
     print("⚠️  Some TinyTorch modules not available - MLOps will use mock implementations")