diff --git a/demos/demo_activations.py b/demos/demo_activations.py
index 1920b884..c5469706 100644
--- a/demos/demo_activations.py
+++ b/demos/demo_activations.py
@@ -30,6 +30,22 @@ def demo_activations():
         ))
         console.print()
         
+        # What this demo shows
+        console.print(Panel(
+            "[bold yellow]What This Demo Shows:[/bold yellow]\n\n"
+            "Activation functions are the 'secret sauce' that gives neural networks their power.\n"
+            "Without them, even deep networks would only learn linear patterns. You'll discover:\n\n"
+            "• Why linear transformations fail on the famous XOR problem\n"
+            "• How ReLU creates sparse, learnable features from data\n"
+            "• How Softmax converts raw scores into probabilities for classification\n"
+            "• The complete forward pass through a neural network\n\n"
+            "[bold cyan]Key Insight:[/bold cyan] Nonlinearity allows networks to learn complex decision boundaries\n"
+            "that can separate any data pattern, not just straight lines!",
+            title="📚 Understanding This Demo",
+            style="blue"
+        ))
+        console.print()
+        
         # Demo 1: Function shapes visualization
         console.print(Panel(
             "Comparing linear vs nonlinear transformations...",
@@ -68,6 +84,11 @@ def demo_activations():
         console.print(activation_table)
         console.print()
         
+        console.print("[dim]💡 [bold]How to Interpret:[/bold] Each activation function shapes data differently:[/dim]")
+        console.print("[dim]   • ReLU: Keeps positive values, zeros out negatives (creates sparsity)[/dim]")
+        console.print("[dim]   • Sigmoid: Squashes any input to (0,1) range (good for probabilities)[/dim]")
+        console.print()
+        
         # Demo 2: The XOR Problem Setup
         console.print(Panel(
             "Showing why we NEED nonlinear activations...",
diff --git a/demos/demo_attention.py b/demos/demo_attention.py
index 61e2443a..563a7803 100644
--- a/demos/demo_attention.py
+++ b/demos/demo_attention.py
@@ -34,6 +34,23 @@ def demo_attention():
         ))
         console.print()
         
+        # What this demo shows
+        console.print(Panel(
+            "[bold yellow]What This Demo Shows:[/bold yellow]\n\n"
+            "Attention mechanisms solved the fundamental problem of sequence processing - how to let\n"
+            "any part of a sequence directly access information from any other part. You'll discover:\n\n"
+            "• Why RNNs failed on long sequences - the information bottleneck problem\n"
+            "• How attention enables direct connections between all sequence positions\n"
+            "• The elegant math behind attention: Query, Key, Value operations\n"
+            "• Why multi-head attention gives different types of understanding\n"
+            "• How Transformers stack attention layers to build deep understanding\n\n"
+            "[bold cyan]Key Insight:[/bold cyan] Attention is about letting the model decide what to focus on,\n"
+            "instead of forcing it through fixed computation patterns. This flexibility is why it works!",
+            title="📚 Understanding This Demo",
+            style="blue"
+        ))
+        console.print()
+        
         # Demo 1: The Attention Problem
         console.print(Panel(
             "From fixed-size bottlenecks to dynamic focus...",
@@ -75,6 +92,11 @@ def demo_attention():
         console.print(comparison_table)
         console.print()
         
+        console.print("[dim]💡 [bold]Key Difference:[/bold] RNNs process sequentially, attention processes in parallel:[/dim]")
+        console.print("[dim]   • RNN: Must go through h3 to connect 'cat' and 'mat' (loses information)[/dim]")
+        console.print("[dim]   • Attention: 'cat' and 'mat' can directly interact (preserves all information)[/dim]")
+        console.print()
+        
         # Problems and solutions
         problems_panel = Panel(
             "❌ Problem: h6 must encode ALL previous information!\n❌ Result: Information loss, especially for long sequences",
@@ -127,6 +149,12 @@ def demo_attention():
             print(f"  'sat' → '{word}': {score:.3f}")
         print()
         
+        console.print("[dim]💡 [bold]Understanding Scores:[/bold] Higher scores = stronger relationships:[/dim]")
+        console.print("[dim]   • Dot product measures similarity between embeddings[/dim]")
+        console.print("[dim]   • Similar vectors have high dot products[/dim]")
+        console.print("[dim]   • These raw scores will be normalized with softmax[/dim]")
+        console.print()
+        
         # Softmax to get attention weights
         exp_scores = np.exp(scores)
         attention_weights = exp_scores / np.sum(exp_scores)
@@ -137,6 +165,12 @@ def demo_attention():
         print(f"Total: {np.sum(attention_weights):.3f}")
         print()
         
+        console.print("[dim]💡 [bold]Weights Interpretation:[/bold] Softmax creates a probability distribution:[/dim]")
+        console.print("[dim]   • All weights sum to 1.0 (100%)[/dim]")
+        console.print("[dim]   • Higher weights = more attention/importance[/dim]")
+        console.print("[dim]   • The model learns what to pay attention to![/dim]")
+        console.print()
+        
         # Compute attended output
         attended_output = np.sum(keys * attention_weights.reshape(-1, 1), axis=0)
         print(f"Attended output for 'sat': {attended_output}")
@@ -173,6 +207,13 @@ def demo_attention():
         print("💡 Key insight: Different heads learn different types of relationships!")
         print()
         
+        console.print("[dim]💡 [bold]Multi-Head Benefits:[/bold] Like having multiple experts:[/dim]")
+        console.print("[dim]   • One head might focus on grammar (subject-verb)[/dim]")
+        console.print("[dim]   • Another on semantics (cat-mat are both objects)[/dim]")
+        console.print("[dim]   • Another on position (nearby words)[/dim]")
+        console.print("[dim]   • Combined: Rich, multi-faceted understanding![/dim]")
+        console.print()
+        
         # Demo 4: Self-Attention in Practice
         print("🎭 Demo 4: Self-Attention - Words Talking to Each Other")
         print("Every word attends to every other word...")
@@ -202,6 +243,12 @@ def demo_attention():
         print("  • 'mat' balances between all words")
         print()
         
+        console.print("[dim]💡 [bold]Self-Attention Patterns:[/bold] Different words have different focus patterns:[/dim]")
+        console.print("[dim]   • Content words (nouns/verbs) often have high self-attention[/dim]")
+        console.print("[dim]   • Function words distribute attention more broadly[/dim]")
+        console.print("[dim]   • These patterns emerge automatically during training![/dim]")
+        console.print()
+        
         # Demo 5: Scaled Dot-Product Attention
         console.print(Panel(
             "The mathematical foundation of modern AI",
@@ -256,6 +303,13 @@ Where:
         
         print()
         
+        console.print("[dim]💡 [bold]The Magic Formula:[/bold] Why this simple equation changed AI:[/dim]")
+        console.print("[dim]   • Q⋅Kᵀ: Measures relevance between positions[/dim]")
+        console.print("[dim]   • √dₖ scaling: Keeps dot products from growing with dₖ, so softmax gradients don't vanish[/dim]")
+        console.print("[dim]   • Softmax: Creates sharp, interpretable attention patterns[/dim]")
+        console.print("[dim]   • ×V: Retrieves weighted information from relevant positions[/dim]")
+        console.print()
+        
         # Demo 6: Transformer Architecture Preview
         console.print(Panel(
             "How attention enables modern language models...",
@@ -308,6 +362,12 @@ Where:
         console.print(why_table)
         console.print()
         
+        console.print("[dim]💡 [bold]Architecture Power:[/bold] Each component has a critical role:[/dim]")
+        console.print("[dim]   • Residual connections: Allow 100+ layer deep networks[/dim]")
+        console.print("[dim]   • Layer norm: Stabilizes training of very deep models[/dim]")
+        console.print("[dim]   • Feed-forward: Adds computation power beyond attention[/dim]")
+        console.print()
+        
         # Demo 7: Real-World Applications
         print("🌍 Demo 7: Real-World Impact")
         print("Where attention mechanisms changed everything...")
@@ -352,6 +412,12 @@ Where:
         
         print()
         
+        console.print("[dim]💡 [bold]Scaling Challenge:[/bold] Why context windows are limited:[/dim]")
+        console.print("[dim]   • Attention is O(n²) - quadratic in sequence length[/dim]")
+        console.print("[dim]   • This is why GPT models have token limits (4k, 8k, 32k, etc.)[/dim]")
+        console.print("[dim]   • Active research: Efficient attention for longer sequences[/dim]")
+        console.print()
+        
         # Success summary
         console.print(Panel.fit(
             "🎯 Achievements:\n"
diff --git a/demos/demo_language.py b/demos/demo_language.py
index 230d6605..f232ec60 100644
--- a/demos/demo_language.py
+++ b/demos/demo_language.py
@@ -32,6 +32,23 @@ def demo_language():
         ))
         console.print()
         
+        # What this demo shows
+        console.print(Panel(
+            "[bold yellow]What This Demo Shows:[/bold yellow]\n\n"
+            "Language generation is the culmination of everything you've learned - combining all the\n"
+            "components into a system that can understand and generate human language. You'll discover:\n\n"
+            "• How text is tokenized into discrete units the model can process\n"
+            "• Why embeddings convert discrete words into continuous vector spaces\n"
+            "• How autoregressive generation produces text one token at a time\n"
+            "• The complete TinyGPT architecture - your own language AI\n"
+            "• How scaling from TinyGPT to GPT-4 unlocks emergent capabilities\n\n"
+            "[bold cyan]Key Insight:[/bold cyan] Language modeling is just predicting the next word - but when done\n"
+            "at scale with transformers, this simple task creates intelligent behavior!",
+            title="📚 Understanding This Demo",
+            style="blue"
+        ))
+        console.print()
+        
         # Demo 1: The Language Modeling Challenge
         print("📚 Demo 1: Understanding Language Generation")
         print("From discrete tokens to continuous predictions...")
@@ -61,6 +78,12 @@ def demo_language():
         print("  Challenge: Capture grammar, semantics, and context!")
         print()
         
+        console.print("[dim]💡 [bold]Core Concept:[/bold] Language modeling = next word prediction:[/dim]")
+        console.print("[dim]   • Each word depends on all previous words (context)[/dim]")
+        console.print("[dim]   • The model outputs probabilities for all possible next words[/dim]")
+        console.print("[dim]   • Training teaches which words are likely to follow others[/dim]")
+        console.print()
+        
         # Demo 2: Token Embeddings
         print("🔤 Demo 2: Token Embeddings - Words as Vectors")
         print("Converting discrete tokens to continuous representations...")
@@ -86,6 +109,12 @@ def demo_language():
         print("(This is learned during training)")
         print()
         
+        console.print("[dim]💡 [bold]Embedding Space:[/bold] Words become points in high-dimensional space:[/dim]")
+        console.print("[dim]   • 'cat' and 'dog' should be nearby (both animals)[/dim]")
+        console.print("[dim]   • 'ran' and 'walked' should be nearby (both movement verbs)[/dim]")
+        console.print("[dim]   • Vector arithmetic works: king - man + woman ≈ queen[/dim]")
+        console.print()
+        
         # Demo 3: Sequence Processing
         print("📝 Demo 3: Sequence Processing with Attention")
         print("How transformers understand context...")
@@ -120,6 +149,13 @@ def demo_language():
         print("  • 'sat' focuses on 'cat' (what the cat did)")
         print()
         
+        console.print("[dim]💡 [bold]Attention in Language:[/bold] Words 'look back' at relevant context:[/dim]")
+        console.print("[dim]   • Verbs attend to their subjects[/dim]")
+        console.print("[dim]   • Pronouns attend to their antecedents[/dim]")
+        console.print("[dim]   • Adjectives attend to their nouns[/dim]")
+        console.print("[dim]   These patterns emerge automatically during training![/dim]")
+        console.print()
+        
         # Demo 4: TinyGPT Architecture
         print("🧠 Demo 4: TinyGPT Architecture")
         print("Complete transformer model for text generation...")
@@ -198,6 +234,13 @@ def demo_language():
         print(f"Generated text: '{final_text}'")
         print()
         
+        console.print("[dim]💡 [bold]Generation Strategy:[/bold] Different sampling methods produce different text:[/dim]")
+        console.print("[dim]   • Greedy: Always pick highest probability (deterministic, repetitive)[/dim]")
+        console.print("[dim]   • Temperature sampling: Adjust probability sharpness (creativity control)[/dim]")
+        console.print("[dim]   • Top-k: Sample from top k most likely tokens (balanced)[/dim]")
+        console.print("[dim]   • Nucleus (top-p): Sample from smallest set with cumulative p (adaptive)[/dim]")
+        console.print()
+        
         # Demo 6: Autoregressive Generation
         print("🔄 Demo 6: Autoregressive Generation")
         print("Why we generate one token at a time...")
@@ -222,6 +265,12 @@ def demo_language():
         print("  • Enables controllable generation")
         print()
         
+        console.print("[dim]💡 [bold]Mathematical Foundation:[/bold] Chain rule of probability:[/dim]")
+        console.print("[dim]   • Decomposes joint probability into conditional probabilities[/dim]")
+        console.print("[dim]   • Each token depends on entire history[/dim]")
+        console.print("[dim]   • This is why transformers need attention - to see all history![/dim]")
+        console.print()
+        
         # Demo 7: Training vs Inference
         print("🎓 Demo 7: Training vs Inference")
         print("Different processes for learning vs generating...")
@@ -261,6 +310,13 @@ def demo_language():
             print(f"  {name}: {vocab} vocab, {dims}, {layers} → {capability}")
         
         print()
+        
+        console.print("[dim]💡 [bold]Scaling Laws:[/bold] Bigger models are qualitatively different:[/dim]")
+        console.print("[dim]   • 10× parameters ≈ predictable performance gain[/dim]")
+        console.print("[dim]   • Emergent abilities appear at scale thresholds[/dim]")
+        console.print("[dim]   • In-context learning emerges around 1B parameters[/dim]")
+        console.print("[dim]   • Reasoning emerges around 100B parameters[/dim]")
+        console.print()
         print("Emergent capabilities with scale:")
         print("  • Few-shot learning (learn from examples)")
         print("  • Chain-of-thought reasoning")
@@ -333,6 +389,13 @@ def demo_language():
         print("This knowledge will serve you in any AI/ML career path.")
         print()
         
+        console.print("[dim]💡 [bold]Your Achievement:[/bold] You've built every component of modern AI:[/dim]")
+        console.print("[dim]   • You understand the math (tensors, gradients, optimization)[/dim]")
+        console.print("[dim]   • You understand the engineering (memory, compute, scaling)[/dim]")
+        console.print("[dim]   • You understand the systems (training, deployment, monitoring)[/dim]")
+        console.print("[dim]   • Most importantly: You built it all yourself![/dim]")
+        console.print()
+        
         print("🏆 TinyTorch Language Generation Demo Complete!")
         print("🎯 Final Achievements:")
         print("  • Understood language modeling as a prediction task")
diff --git a/demos/demo_single_neuron.py b/demos/demo_single_neuron.py
index cb172beb..4acc0e18 100644
--- a/demos/demo_single_neuron.py
+++ b/demos/demo_single_neuron.py
@@ -31,9 +31,26 @@ def demo_single_neuron():
         ))
         console.print()
         
+        # What this demo shows
+        console.print(Panel(
+            "[bold yellow]What This Demo Shows:[/bold yellow]\n\n"
+            "We're going to watch a single neuron (the basic unit of neural networks) learn to solve\n"
+            "the AND gate problem through gradient descent. You'll see:\n\n"
+            "• How random weights produce wrong answers initially\n"
+            "• How the neuron adjusts its weights based on errors\n"
+            "• The formation of a decision boundary that separates 0s from 1s\n"
+            "• Why some problems (AND) are learnable while others (XOR) need multiple layers\n\n"
+            "[bold cyan]Key Insight:[/bold cyan] A neuron is just a weighted sum followed by an activation function.\n"
+            "Learning means finding the right weights!",
+            title="📚 Understanding This Demo",
+            style="blue"
+        ))
+        console.print()
+        
         # Demo 1: The AND gate problem
         console.print(Panel(
-            "Teaching a neuron digital logic...",
+            "The AND gate outputs 1 only when BOTH inputs are 1.\n"
+            "This is a 'linearly separable' problem - a single line can divide the outputs.",
             title="⚡ Demo 1: The AND Gate Learning Problem",
             style="green"
         ))
@@ -56,6 +73,10 @@ def demo_single_neuron():
         console.print(and_table)
         console.print()
         
+        console.print("[dim]💡 [bold]How to Read This:[/bold] The AND gate is like a logical 'both must be true' operator.[/dim]")
+        console.print("[dim]   Notice only the last row (1 AND 1) outputs 1. Our neuron needs to learn this pattern![/dim]")
+        console.print()
+        
         # Demo 2: Manual neuron implementation
         console.print(Panel(
             "Understanding: output = sigmoid(w1*x1 + w2*x2 + bias)",
@@ -161,6 +182,12 @@ def demo_single_neuron():
         console.print(training_table)
         console.print()
         
+        console.print("[dim]💡 [bold]What's Happening:[/bold] Watch the error decrease as the neuron learns![/dim]")
+        console.print("[dim]   • Error measures how wrong our predictions are (lower is better)[/dim]")
+        console.print("[dim]   • Weights are adjusting to reduce this error through gradient descent[/dim]")
+        console.print("[dim]   • The bias shifts the decision boundary position[/dim]")
+        console.print()
+        
         # Final predictions
         console.print("[bold green]🎯 Final Results After Training:[/bold green]")
         z_final = tt.Tensor(X.data @ weights.data + bias.data)
diff --git a/demos/demo_tensor_math.py b/demos/demo_tensor_math.py
index 297f99d3..3335e52a 100644
--- a/demos/demo_tensor_math.py
+++ b/demos/demo_tensor_math.py
@@ -28,6 +28,22 @@ def demo_tensor_math():
         ))
         console.print()
         
+        # What this demo shows
+        console.print(Panel(
+            "[bold yellow]What This Demo Shows:[/bold yellow]\n\n"
+            "Tensors are the foundation of all neural networks - they're just multi-dimensional arrays\n"
+            "that can represent scalars, vectors, matrices, and higher dimensions. You'll see:\n\n"
+            "• Solving systems of linear equations (finding x in Ax = b)\n"
+            "• Geometric transformations with rotation matrices\n"
+            "• Batch processing - operating on multiple data points simultaneously\n"
+            "• How neural network weights are just matrices doing transformations\n\n"
+            "[bold cyan]Key Insight:[/bold cyan] Every neural network operation is matrix multiplication at its core.\n"
+            "Understanding tensors means understanding how neural networks compute!",
+            title="📚 Understanding This Demo",
+            style="blue"
+        ))
+        console.print()
+        
         # Demo 1: Solve system of linear equations
         console.print(Panel(
             "System: 2x + 3y = 13\n        1x + 1y = 5",
@@ -74,6 +90,10 @@ def demo_tensor_math():
         console.print(verify_table)
         console.print()
         
+        console.print("[dim]💡 [bold]What Just Happened:[/bold] We solved for x=2, y=3 using matrix operations![/dim]")
+        console.print("[dim]   Neural networks use these same matrix operations, but find their weights iteratively via gradient descent.[/dim]")
+        console.print()
+        
         # Demo 2: Matrix transformation (rotation)
         console.print(Panel(
             "Rotating point (1, 0) by 45°...",
diff --git a/demos/demo_training.py b/demos/demo_training.py
index 39c77148..34bcdbf9 100644
--- a/demos/demo_training.py
+++ b/demos/demo_training.py
@@ -34,6 +34,22 @@ def demo_training():
         ))
         console.print()
         
+        # What this demo shows
+        console.print(Panel(
+            "[bold yellow]What This Demo Shows:[/bold yellow]\n\n"
+            "This is where everything comes together - a complete training pipeline that takes\n"
+            "random weights and produces a working classifier. You'll witness:\n\n"
+            "• Data preparation and batching for efficient training\n"
+            "• The training loop: forward pass → loss calculation → backpropagation\n"
+            "• Real-time learning progress with loss and accuracy metrics\n"
+            "• Model evaluation and deployment considerations\n\n"
+            "[bold cyan]Key Insight:[/bold cyan] Training is an optimization process - we iteratively adjust weights\n"
+            "to minimize prediction errors. Watch the loss decrease and accuracy increase!",
+            title="📚 Understanding This Demo",
+            style="blue"
+        ))
+        console.print()
+        
         # Demo 1: The Training Problem
         print("🎯 Demo 1: The Machine Learning Training Challenge")
         print("From random weights to intelligent behavior...")
diff --git a/demos/demo_vision.py b/demos/demo_vision.py
index 11dd436c..5dce020c 100644
--- a/demos/demo_vision.py
+++ b/demos/demo_vision.py
@@ -31,6 +31,23 @@ def demo_vision():
         ))
         console.print()
         
+        # What this demo shows
+        console.print(Panel(
+            "[bold yellow]What This Demo Shows:[/bold yellow]\n\n"
+            "Convolutional neural networks (CNNs) revolutionized computer vision by learning to detect\n"
+            "visual patterns hierarchically. You'll understand:\n\n"
+            "• How digital images are just 2D arrays of numbers (tensors)\n"
+            "• How convolution operations scan images to detect local patterns\n"
+            "• Why edge detection is fundamental - edges define object boundaries\n"
+            "• How multiple filters create different 'views' of the same image\n"
+            "• Why CNNs build hierarchical features: edges → textures → shapes → objects\n\n"
+            "[bold cyan]Key Insight:[/bold cyan] CNNs automatically learn which patterns matter for your task.\n"
+            "Early layers detect simple edges, deeper layers combine them into complex features!",
+            title="📚 Understanding This Demo",
+            style="blue"
+        ))
+        console.print()
+        
         # Demo 1: The Image Processing Foundation
         print("🖼️ Demo 1: Digital Images as Tensors")
         print("Understanding how computers see...")
@@ -54,6 +71,12 @@ def demo_vision():
         print(f"Pixel values: {np.unique(image.data)} (0=black, 1=white)")
         print()
         
+        console.print("[dim]💡 [bold]How to Read This:[/bold] Each symbol represents a pixel value:[/dim]")
+        console.print("[dim]   • █ = 1 (white/bright pixel), · = 0 (black/dark pixel)[/dim]")
+        console.print("[dim]   • This diamond pattern is what the computer 'sees' as numbers[/dim]")
+        console.print("[dim]   • Real images have values 0-255, but the principle is the same[/dim]")
+        console.print()
+        
         # Demo 2: Edge Detection - Computer Vision's Foundation
         print("🔍 Demo 2: Edge Detection - How Computers Find Shapes")
         print("Using convolution to detect edges...")
@@ -91,6 +114,13 @@ def demo_vision():
             print("  " + " ".join(f"{val:2.0f}" for val in row))
         print()
         
+        console.print("[dim]💡 [bold]Interpreting Edge Detection:[/bold] The numbers show edge strength:[/dim]")
+        console.print("[dim]   • Positive values = bright-to-dark transitions[/dim]")
+        console.print("[dim]   • Negative values = dark-to-bright transitions[/dim]")
+        console.print("[dim]   • Zero = no edge (uniform area)[/dim]")
+        console.print("[dim]   • Larger absolute values = stronger edges[/dim]")
+        console.print()
+        
         # Combine edges
         edge_magnitude = tt.Tensor(np.sqrt(edge_x**2 + edge_y**2))
         print("Combined edge magnitude:")
@@ -127,6 +157,12 @@ def demo_vision():
             print("  " + " ".join(f"{val:2.0f}" for val in row))
         print()
         
+        console.print("[dim]💡 [bold]Understanding Feature Detection:[/bold] Each filter learns to detect specific patterns:[/dim]")
+        console.print("[dim]   • High positive values = strong match to the pattern[/dim]")
+        console.print("[dim]   • Near zero = pattern not present[/dim]")
+        console.print("[dim]   • In real CNNs, hundreds of filters learn different features automatically[/dim]")
+        console.print()
+        
         # Demo 4: Multi-layer Feature Extraction
         print("🏗️ Demo 4: Deep Feature Extraction")
         print("Building feature hierarchy like real CNNs...")
@@ -147,6 +183,13 @@ def demo_vision():
         print("  Input(5×5) → Conv2D(3×3) → ReLU → Flatten → Dense(9→5) → ReLU → Dense(5→1) → Sigmoid")
         print()
         
+        console.print("[dim]💡 [bold]Architecture Flow:[/bold] Data transforms through the network:[/dim]")
+        console.print("[dim]   • Conv2D: Extracts spatial features (edges, corners)[/dim]")
+        console.print("[dim]   • ReLU: Adds nonlinearity for complex patterns[/dim]")
+        console.print("[dim]   • Flatten: Converts 2D features to 1D for classification[/dim]")
+        console.print("[dim]   • Dense layers: Combine features for final decision[/dim]")
+        console.print()
+        
         # Set known good weights for demonstration
         cnn.layers[0].kernel = corner_kernel.data  # Use corner detector
         
@@ -183,6 +226,12 @@ def demo_vision():
         print("  Total: ~5 MB parameters + activations")
         print()
         
+        console.print("[dim]💡 [bold]Scaling Insights:[/bold] Notice how parameters grow:[/dim]")
+        console.print("[dim]   • Conv layers: Few parameters but powerful feature extraction[/dim]")
+        console.print("[dim]   • Dense layers: Most parameters are here (fully connected)[/dim]")
+        console.print("[dim]   • This is why modern CNNs minimize dense layers![/dim]")
+        console.print()
+        
         # Demo 6: Feature Visualization
         print("👁️ Demo 6: What CNNs Actually Learn")
         print("Visualizing learned features...")
@@ -229,6 +278,12 @@ def demo_vision():
         
         print()
         
+        console.print("[dim]💡 [bold]Learning Process:[/bold] CNNs discover features automatically:[/dim]")
+        console.print("[dim]   • No need to hand-design edge detectors[/dim]")
+        console.print("[dim]   • The network learns what patterns matter for your task[/dim]")
+        console.print("[dim]   • Different tasks learn different features from same architecture![/dim]")
+        console.print()
+        
         print("🏆 TinyTorch Computer Vision Demo Complete!")
         print("🎯 Achievements:")
         print("  • Processed images as numerical tensors")
diff --git a/demos/demo_xor_network.py b/demos/demo_xor_network.py
index 2d422eca..b0776811 100644
--- a/demos/demo_xor_network.py
+++ b/demos/demo_xor_network.py
@@ -32,6 +32,22 @@ def demo_xor_network():
         ))
         console.print()
         
+        # What this demo shows
+        console.print(Panel(
+            "[bold yellow]What This Demo Shows:[/bold yellow]\n\n"
+            "The XOR problem is the classic example that proved we need multi-layer networks.\n"
+            "A single neuron cannot solve XOR, but two layers can! You'll understand:\n\n"
+            "• Why XOR is 'not linearly separable' (no single line works)\n"
+            "• How hidden layers create intermediate features that ARE separable\n"
+            "• The power of depth in neural networks - each layer transforms the problem\n"
+            "• How modern deep learning builds on this multi-layer principle\n\n"
+            "[bold cyan]Key Insight:[/bold cyan] Hidden layers transform the input space into a new representation\n"
+            "where previously impossible problems become solvable!",
+            title="📚 Understanding This Demo",
+            style="blue"
+        ))
+        console.print()
+        
         # Demo 1: The XOR problem setup
         console.print(Panel(
             "Why single neurons fail and multi-layer networks succeed...",