diff --git a/demos/demo_activations.py b/demos/demo_activations.py index 1920b884..c5469706 100644 --- a/demos/demo_activations.py +++ b/demos/demo_activations.py @@ -30,6 +30,22 @@ def demo_activations(): )) console.print() + # What this demo shows + console.print(Panel( + "[bold yellow]What This Demo Shows:[/bold yellow]\n\n" + "Activation functions are the 'secret sauce' that gives neural networks their power.\n" + "Without them, even deep networks would only learn linear patterns. You'll discover:\n\n" + "• Why linear transformations fail on the famous XOR problem\n" + "• How ReLU creates sparse, learnable features from data\n" + "• How Softmax converts raw scores into probabilities for classification\n" + "• The complete forward pass through a neural network\n\n" + "[bold cyan]Key Insight:[/bold cyan] Nonlinearity allows networks to learn complex decision boundaries\n" + "that can separate any data pattern, not just straight lines!", + title="📚 Understanding This Demo", + style="blue" + )) + console.print() + # Demo 1: Function shapes visualization console.print(Panel( "Comparing linear vs nonlinear transformations...", @@ -68,6 +84,11 @@ def demo_activations(): console.print(activation_table) console.print() + console.print("[dim]💡 [bold]How to Interpret:[/bold] Each activation function shapes data differently:[/dim]") + console.print("[dim] • ReLU: Keeps positive values, zeros out negatives (creates sparsity)[/dim]") + console.print("[dim] • Sigmoid: Squashes any input to (0,1) range (good for probabilities)[/dim]") + console.print() + # Demo 2: The XOR Problem Setup console.print(Panel( "Showing why we NEED nonlinear activations...", diff --git a/demos/demo_attention.py b/demos/demo_attention.py index 61e2443a..563a7803 100644 --- a/demos/demo_attention.py +++ b/demos/demo_attention.py @@ -34,6 +34,23 @@ def demo_attention(): )) console.print() + # What this demo shows + console.print(Panel( + "[bold yellow]What This Demo Shows:[/bold yellow]\n\n" + "Attention mechanisms solved the fundamental problem of sequence processing - how to let\n" + "any part of a sequence directly access information from any other part. You'll discover:\n\n" + "• Why RNNs failed on long sequences - the information bottleneck problem\n" + "• How attention enables direct connections between all sequence positions\n" + "• The elegant math behind attention: Query, Key, Value operations\n" + "• Why multi-head attention gives different types of understanding\n" + "• How Transformers stack attention layers to build deep understanding\n\n" + "[bold cyan]Key Insight:[/bold cyan] Attention is about letting the model decide what to focus on,\n" + "instead of forcing it through fixed computation patterns. This flexibility is why it works!", + title="📚 Understanding This Demo", + style="blue" + )) + console.print() + # Demo 1: The Attention Problem console.print(Panel( "From fixed-size bottlenecks to dynamic focus...", @@ -75,6 +92,11 @@ def demo_attention(): console.print(comparison_table) console.print() + console.print("[dim]💡 [bold]Key Difference:[/bold] RNNs process sequentially, attention processes in parallel:[/dim]") + console.print("[dim] • RNN: Must go through h3 to connect 'cat' and 'mat' (loses information)[/dim]") + console.print("[dim] • Attention: 'cat' and 'mat' can directly interact (preserves all information)[/dim]") + console.print() + # Problems and solutions problems_panel = Panel( "❌ Problem: h6 must encode ALL previous information!\n❌ Result: Information loss, especially for long sequences", @@ -127,6 +149,12 @@ def demo_attention(): print(f" 'sat' → '{word}': {score:.3f}") print() + console.print("[dim]💡 [bold]Understanding Scores:[/bold] Higher scores = stronger relationships:[/dim]") + console.print("[dim] • Dot product measures similarity between embeddings[/dim]") + console.print("[dim] • Similar vectors have high dot products[/dim]") + console.print("[dim] • These raw scores will be normalized with softmax[/dim]") + console.print() + # Softmax to get attention weights exp_scores = np.exp(scores) attention_weights = exp_scores / np.sum(exp_scores) @@ -137,6 +165,12 @@ def demo_attention(): print(f"Total: {np.sum(attention_weights):.3f}") print() + console.print("[dim]💡 [bold]Weights Interpretation:[/bold] Softmax creates a probability distribution:[/dim]") + console.print("[dim] • All weights sum to 1.0 (100%)[/dim]") + console.print("[dim] • Higher weights = more attention/importance[/dim]") + console.print("[dim] • The model learns what to pay attention to![/dim]") + console.print() + # Compute attended output attended_output = np.sum(keys * attention_weights.reshape(-1, 1), axis=0) print(f"Attended output for 'sat': {attended_output}") @@ -173,6 +207,13 @@ def demo_attention(): print("💡 Key insight: Different heads learn different types of relationships!") print() + console.print("[dim]💡 [bold]Multi-Head Benefits:[/bold] Like having multiple experts:[/dim]") + console.print("[dim] • One head might focus on grammar (subject-verb)[/dim]") + console.print("[dim] • Another on semantics (cat-mat are both objects)[/dim]") + console.print("[dim] • Another on position (nearby words)[/dim]") + console.print("[dim] • Combined: Rich, multi-faceted understanding![/dim]") + console.print() + # Demo 4: Self-Attention in Practice print("🎭 Demo 4: Self-Attention - Words Talking to Each Other") print("Every word attends to every other word...") @@ -202,6 +243,12 @@ def demo_attention(): print(" • 'mat' balances between all words") print() + console.print("[dim]💡 [bold]Self-Attention Patterns:[/bold] Different words have different focus patterns:[/dim]") + console.print("[dim] • Content words (nouns/verbs) often have high self-attention[/dim]") + console.print("[dim] • Function words distribute attention more broadly[/dim]") + console.print("[dim] • These patterns emerge automatically during training![/dim]") + console.print() + # Demo 5: Scaled Dot-Product Attention console.print(Panel( "The mathematical foundation of modern AI", @@ -256,6 +303,13 @@ Where: print() + console.print("[dim]💡 [bold]The Magic Formula:[/bold] Why this simple equation changed AI:[/dim]") + console.print("[dim] • Q⋅Kᵀ: Measures relevance between positions[/dim]") + console.print("[dim] • √dₖ scaling: Prevents gradient problems in deep networks[/dim]") + console.print("[dim] • Softmax: Creates sharp, interpretable attention patterns[/dim]") + console.print("[dim] • ×V: Retrieves weighted information from relevant positions[/dim]") + console.print() + # Demo 6: Transformer Architecture Preview console.print(Panel( "How attention enables modern language models...", @@ -308,6 +362,12 @@ Where: console.print(why_table) console.print() + console.print("[dim]💡 [bold]Architecture Power:[/bold] Each component has a critical role:[/dim]") + console.print("[dim] • Residual connections: Allow 100+ layer deep networks[/dim]") + console.print("[dim] • Layer norm: Stabilizes training of very deep models[/dim]") + console.print("[dim] • Feed-forward: Adds computation power beyond attention[/dim]") + console.print() + # Demo 7: Real-World Applications print("🌍 Demo 7: Real-World Impact") print("Where attention mechanisms changed everything...") @@ -352,6 +412,12 @@ Where: print() + console.print("[dim]💡 [bold]Scaling Challenge:[/bold] Why context windows are limited:[/dim]") + console.print("[dim] • Attention is O(n²) - quadratic in sequence length[/dim]") + console.print("[dim] • This is why GPT models have token limits (4k, 8k, 32k, etc.)[/dim]") + console.print("[dim] • Active research: Efficient attention for longer sequences[/dim]") + console.print() + # Success summary console.print(Panel.fit( "🎯 Achievements:\n" diff --git a/demos/demo_language.py b/demos/demo_language.py index 230d6605..f232ec60 100644 --- a/demos/demo_language.py +++ b/demos/demo_language.py @@ -32,6 +32,23 @@ def demo_language(): )) console.print() + # What this demo shows + console.print(Panel( + "[bold yellow]What This Demo Shows:[/bold yellow]\n\n" + "Language generation is the culmination of everything you've learned - combining all the\n" + "components into a system that can understand and generate human language. You'll discover:\n\n" + "• How text is tokenized into discrete units the model can process\n" + "• Why embeddings convert discrete words into continuous vector spaces\n" + "• How autoregressive generation produces text one token at a time\n" + "• The complete TinyGPT architecture - your own language AI\n" + "• How scaling from TinyGPT to GPT-4 unlocks emergent capabilities\n\n" + "[bold cyan]Key Insight:[/bold cyan] Language modeling is just predicting the next word - but when done\n" + "at scale with transformers, this simple task creates intelligent behavior!", + title="📚 Understanding This Demo", + style="blue" + )) + console.print() + # Demo 1: The Language Modeling Challenge print("📚 Demo 1: Understanding Language Generation") print("From discrete tokens to continuous predictions...") @@ -61,6 +78,12 @@ def demo_language(): print(" Challenge: Capture grammar, semantics, and context!") print() + console.print("[dim]💡 [bold]Core Concept:[/bold] Language modeling = next word prediction:[/dim]") + console.print("[dim] • Each word depends on all previous words (context)[/dim]") + console.print("[dim] • The model outputs probabilities for all possible next words[/dim]") + console.print("[dim] • Training teaches which words are likely to follow others[/dim]") + console.print() + # Demo 2: Token Embeddings print("🔤 Demo 2: Token Embeddings - Words as Vectors") print("Converting discrete tokens to continuous representations...") @@ -86,6 +109,12 @@ def demo_language(): print("(This is learned during training)") print() + console.print("[dim]💡 [bold]Embedding Space:[/bold] Words become points in high-dimensional space:[/dim]") + console.print("[dim] • 'cat' and 'dog' should be nearby (both animals)[/dim]") + console.print("[dim] • 'ran' and 'walked' should be nearby (both movement verbs)[/dim]") + console.print("[dim] • Vector arithmetic works: king - man + woman ≈ queen[/dim]") + console.print() + # Demo 3: Sequence Processing print("📝 Demo 3: Sequence Processing with Attention") print("How transformers understand context...") @@ -120,6 +149,13 @@ def demo_language(): print(" • 'sat' focuses on 'cat' (what the cat did)") print() + console.print("[dim]💡 [bold]Attention in Language:[/bold] Words 'look back' at relevant context:[/dim]") + console.print("[dim] • Verbs attend to their subjects[/dim]") + console.print("[dim] • Pronouns attend to their antecedents[/dim]") + console.print("[dim] • Adjectives attend to their nouns[/dim]") + console.print("[dim] These patterns emerge automatically during training![/dim]") + console.print() + # Demo 4: TinyGPT Architecture print("🧠 Demo 4: TinyGPT Architecture") print("Complete transformer model for text generation...") @@ -198,6 +234,13 @@ def demo_language(): print(f"Generated text: '{final_text}'") print() + console.print("[dim]💡 [bold]Generation Strategy:[/bold] Different sampling methods produce different text:[/dim]") + console.print("[dim] • Greedy: Always pick highest probability (deterministic, repetitive)[/dim]") + console.print("[dim] • Temperature sampling: Adjust probability sharpness (creativity control)[/dim]") + console.print("[dim] • Top-k: Sample from top k most likely tokens (balanced)[/dim]") + console.print("[dim] • Nucleus (top-p): Sample from smallest set with cumulative p (adaptive)[/dim]") + console.print() + # Demo 6: Autoregressive Generation print("🔄 Demo 6: Autoregressive Generation") print("Why we generate one token at a time...") @@ -222,6 +265,12 @@ def demo_language(): print(" • Enables controllable generation") print() + console.print("[dim]💡 [bold]Mathematical Foundation:[/bold] Chain rule of probability:[/dim]") + console.print("[dim] • Decomposes joint probability into conditional probabilities[/dim]") + console.print("[dim] • Each token depends on entire history[/dim]") + console.print("[dim] • This is why transformers need attention - to see all history![/dim]") + console.print() + # Demo 7: Training vs Inference print("🎓 Demo 7: Training vs Inference") print("Different processes for learning vs generating...") @@ -261,6 +310,13 @@ def demo_language(): print(f" {name}: {vocab} vocab, {dims}, {layers} → {capability}") print() + + console.print("[dim]💡 [bold]Scaling Laws:[/bold] Bigger models are qualitatively different:[/dim]") + console.print("[dim] • 10× parameters ≈ predictable performance gain[/dim]") + console.print("[dim] • Emergent abilities appear at scale thresholds[/dim]") + console.print("[dim] • In-context learning emerges around 1B parameters[/dim]") + console.print("[dim] • Reasoning emerges around 100B parameters[/dim]") + console.print() print("Emergent capabilities with scale:") print(" • Few-shot learning (learn from examples)") print(" • Chain-of-thought reasoning") @@ -333,6 +389,13 @@ def demo_language(): print("This knowledge will serve you in any AI/ML career path.") print() + console.print("[dim]💡 [bold]Your Achievement:[/bold] You've built every component of modern AI:[/dim]") + console.print("[dim] • You understand the math (tensors, gradients, optimization)[/dim]") + console.print("[dim] • You understand the engineering (memory, compute, scaling)[/dim]") + console.print("[dim] • You understand the systems (training, deployment, monitoring)[/dim]") + console.print("[dim] • Most importantly: You built it all yourself![/dim]") + console.print() + print("🏆 TinyTorch Language Generation Demo Complete!") print("🎯 Final Achievements:") print(" • Understood language modeling as a prediction task") diff --git a/demos/demo_single_neuron.py b/demos/demo_single_neuron.py index cb172beb..4acc0e18 100644 --- a/demos/demo_single_neuron.py +++ b/demos/demo_single_neuron.py @@ -31,9 +31,26 @@ def demo_single_neuron(): )) console.print() + # What this demo shows + console.print(Panel( + "[bold yellow]What This Demo Shows:[/bold yellow]\n\n" + "We're going to watch a single neuron (the basic unit of neural networks) learn to solve\n" + "the AND gate problem through gradient descent. You'll see:\n\n" + "• How random weights produce wrong answers initially\n" + "• How the neuron adjusts its weights based on errors\n" + "• The formation of a decision boundary that separates 0s from 1s\n" + "• Why some problems (AND) are learnable while others (XOR) need multiple layers\n\n" + "[bold cyan]Key Insight:[/bold cyan] A neuron is just a weighted sum followed by an activation function.\n" + "Learning means finding the right weights!", + title="📚 Understanding This Demo", + style="blue" + )) + console.print() + # Demo 1: The AND gate problem console.print(Panel( - "Teaching a neuron digital logic...", + "The AND gate outputs 1 only when BOTH inputs are 1.\n" + "This is a 'linearly separable' problem - a single line can divide the outputs.", title="⚡ Demo 1: The AND Gate Learning Problem", style="green" )) @@ -56,6 +73,10 @@ def demo_single_neuron(): console.print(and_table) console.print() + console.print("[dim]💡 [bold]How to Read This:[/bold] The AND gate is like a logical 'both must be true' operator.[/dim]") + console.print("[dim] Notice only the last row (1 AND 1) outputs 1. Our neuron needs to learn this pattern![/dim]") + console.print() + # Demo 2: Manual neuron implementation console.print(Panel( "Understanding: output = sigmoid(w1*x1 + w2*x2 + bias)", @@ -161,6 +182,12 @@ def demo_single_neuron(): console.print(training_table) console.print() + console.print("[dim]💡 [bold]What's Happening:[/bold] Watch the error decrease as the neuron learns![/dim]") + console.print("[dim] • Error measures how wrong our predictions are (lower is better)[/dim]") + console.print("[dim] • Weights are adjusting to reduce this error through gradient descent[/dim]") + console.print("[dim] • The bias shifts the decision boundary position[/dim]") + console.print() + # Final predictions console.print("[bold green]🎯 Final Results After Training:[/bold green]") z_final = tt.Tensor(X.data @ weights.data + bias.data) diff --git a/demos/demo_tensor_math.py b/demos/demo_tensor_math.py index 297f99d3..3335e52a 100644 --- a/demos/demo_tensor_math.py +++ b/demos/demo_tensor_math.py @@ -28,6 +28,22 @@ def demo_tensor_math(): )) console.print() + # What this demo shows + console.print(Panel( + "[bold yellow]What This Demo Shows:[/bold yellow]\n\n" + "Tensors are the foundation of all neural networks - they're just multi-dimensional arrays\n" + "that can represent scalars, vectors, matrices, and higher dimensions. You'll see:\n\n" + "• Solving systems of linear equations (finding x in Ax = b)\n" + "• Geometric transformations with rotation matrices\n" + "• Batch processing - operating on multiple data points simultaneously\n" + "• How neural network weights are just matrices doing transformations\n\n" + "[bold cyan]Key Insight:[/bold cyan] Every neural network operation is matrix multiplication at its core.\n" + "Understanding tensors means understanding how neural networks compute!", + title="📚 Understanding This Demo", + style="blue" + )) + console.print() + # Demo 1: Solve system of linear equations console.print(Panel( "System: 2x + 3y = 13\n 1x + 1y = 5", @@ -74,6 +90,10 @@ def demo_tensor_math(): console.print(verify_table) console.print() + console.print("[dim]💡 [bold]What Just Happened:[/bold] We solved for x=2, y=3 using matrix operations![/dim]") + console.print("[dim] This is exactly how neural networks solve for optimal weights during training.[/dim]") + console.print() + # Demo 2: Matrix transformation (rotation) console.print(Panel( "Rotating point (1, 0) by 45°...", diff --git a/demos/demo_training.py b/demos/demo_training.py index 39c77148..34bcdbf9 100644 --- a/demos/demo_training.py +++ b/demos/demo_training.py @@ -34,6 +34,22 @@ def demo_training(): )) console.print() + # What this demo shows + console.print(Panel( + "[bold yellow]What This Demo Shows:[/bold yellow]\n\n" + "This is where everything comes together - a complete training pipeline that takes\n" + "random weights and produces a working classifier. You'll witness:\n\n" + "• Data preparation and batching for efficient training\n" + "• The training loop: forward pass → loss calculation → backpropagation\n" + "• Real-time learning progress with loss and accuracy metrics\n" + "• Model evaluation and deployment considerations\n\n" + "[bold cyan]Key Insight:[/bold cyan] Training is an optimization process - we iteratively adjust weights\n" + "to minimize prediction errors. Watch the loss decrease and accuracy increase!", + title="📚 Understanding This Demo", + style="blue" + )) + console.print() + # Demo 1: The Training Problem print("🎯 Demo 1: The Machine Learning Training Challenge") print("From random weights to intelligent behavior...") diff --git a/demos/demo_vision.py b/demos/demo_vision.py index 11dd436c..5dce020c 100644 --- a/demos/demo_vision.py +++ b/demos/demo_vision.py @@ -31,6 +31,23 @@ def demo_vision(): )) console.print() + # What this demo shows + console.print(Panel( + "[bold yellow]What This Demo Shows:[/bold yellow]\n\n" + "Convolutional neural networks (CNNs) revolutionized computer vision by learning to detect\n" + "visual patterns hierarchically. You'll understand:\n\n" + "• How digital images are just 2D arrays of numbers (tensors)\n" + "• How convolution operations scan images to detect local patterns\n" + "• Why edge detection is fundamental - edges define object boundaries\n" + "• How multiple filters create different 'views' of the same image\n" + "• Why CNNs build hierarchical features: edges → textures → shapes → objects\n\n" + "[bold cyan]Key Insight:[/bold cyan] CNNs automatically learn which patterns matter for your task.\n" + "Early layers detect simple edges, deeper layers combine them into complex features!", + title="📚 Understanding This Demo", + style="blue" + )) + console.print() + # Demo 1: The Image Processing Foundation print("🖼️ Demo 1: Digital Images as Tensors") print("Understanding how computers see...") @@ -54,6 +71,12 @@ def demo_vision(): print(f"Pixel values: {np.unique(image.data)} (0=black, 1=white)") print() + console.print("[dim]💡 [bold]How to Read This:[/bold] Each symbol represents a pixel value:[/dim]") + console.print("[dim] • █ = 1 (white/bright pixel), · = 0 (black/dark pixel)[/dim]") + console.print("[dim] • This diamond pattern is what the computer 'sees' as numbers[/dim]") + console.print("[dim] • Real images have values 0-255, but the principle is the same[/dim]") + console.print() + # Demo 2: Edge Detection - Computer Vision's Foundation print("🔍 Demo 2: Edge Detection - How Computers Find Shapes") print("Using convolution to detect edges...") @@ -91,6 +114,13 @@ def demo_vision(): print(" " + " ".join(f"{val:2.0f}" for val in row)) print() + console.print("[dim]💡 [bold]Interpreting Edge Detection:[/bold] The numbers show edge strength:[/dim]") + console.print("[dim] • Positive values = bright-to-dark transitions[/dim]") + console.print("[dim] • Negative values = dark-to-bright transitions[/dim]") + console.print("[dim] • Zero = no edge (uniform area)[/dim]") + console.print("[dim] • Larger absolute values = stronger edges[/dim]") + console.print() + # Combine edges edge_magnitude = tt.Tensor(np.sqrt(edge_x**2 + edge_y**2)) print("Combined edge magnitude:") @@ -127,6 +157,12 @@ def demo_vision(): print(" " + " ".join(f"{val:2.0f}" for val in row)) print() + console.print("[dim]💡 [bold]Understanding Feature Detection:[/bold] Each filter learns to detect specific patterns:[/dim]") + console.print("[dim] • High positive values = strong match to the pattern[/dim]") + console.print("[dim] • Near zero = pattern not present[/dim]") + console.print("[dim] • In real CNNs, hundreds of filters learn different features automatically[/dim]") + console.print() + # Demo 4: Multi-layer Feature Extraction print("🏗️ Demo 4: Deep Feature Extraction") print("Building feature hierarchy like real CNNs...") @@ -147,6 +183,13 @@ def demo_vision(): print(" Input(5×5) → Conv2D(3×3) → ReLU → Flatten → Dense(9→5) → ReLU → Dense(5→1) → Sigmoid") print() + console.print("[dim]💡 [bold]Architecture Flow:[/bold] Data transforms through the network:[/dim]") + console.print("[dim] • Conv2D: Extracts spatial features (edges, corners)[/dim]") + console.print("[dim] • ReLU: Adds nonlinearity for complex patterns[/dim]") + console.print("[dim] • Flatten: Converts 2D features to 1D for classification[/dim]") + console.print("[dim] • Dense layers: Combine features for final decision[/dim]") + console.print() + # Set known good weights for demonstration cnn.layers[0].kernel = corner_kernel.data # Use corner detector @@ -183,6 +226,12 @@ def demo_vision(): print(" Total: ~5 MB parameters + activations") print() + console.print("[dim]💡 [bold]Scaling Insights:[/bold] Notice how parameters grow:[/dim]") + console.print("[dim] • Conv layers: Few parameters but powerful feature extraction[/dim]") + console.print("[dim] • Dense layers: Most parameters are here (fully connected)[/dim]") + console.print("[dim] • This is why modern CNNs minimize dense layers![/dim]") + console.print() + # Demo 6: Feature Visualization print("👁️ Demo 6: What CNNs Actually Learn") print("Visualizing learned features...") @@ -229,6 +278,12 @@ def demo_vision(): print() + console.print("[dim]💡 [bold]Learning Process:[/bold] CNNs discover features automatically:[/dim]") + console.print("[dim] • No need to hand-design edge detectors[/dim]") + console.print("[dim] • The network learns what patterns matter for your task[/dim]") + console.print("[dim] • Different tasks learn different features from same architecture![/dim]") + console.print() + print("🏆 TinyTorch Computer Vision Demo Complete!") print("🎯 Achievements:") print(" • Processed images as numerical tensors") diff --git a/demos/demo_xor_network.py b/demos/demo_xor_network.py index 2d422eca..b0776811 100644 --- a/demos/demo_xor_network.py +++ b/demos/demo_xor_network.py @@ -32,6 +32,22 @@ def demo_xor_network(): )) console.print() + # What this demo shows + console.print(Panel( + "[bold yellow]What This Demo Shows:[/bold yellow]\n\n" + "The XOR problem is the classic example that proved we need multi-layer networks.\n" + "A single neuron cannot solve XOR, but two layers can! You'll understand:\n\n" + "• Why XOR is 'not linearly separable' (no single line works)\n" + "• How hidden layers create intermediate features that ARE separable\n" + "• The power of depth in neural networks - each layer transforms the problem\n" + "• How modern deep learning builds on this multi-layer principle\n\n" + "[bold cyan]Key Insight:[/bold cyan] Hidden layers transform the input space into a new representation\n" + "where previously impossible problems become solvable!", + title="📚 Understanding This Demo", + style="blue" + )) + console.print() + # Demo 1: The XOR problem setup console.print(Panel( "Why single neurons fail and multi-layer networks succeed...",