#!/usr/bin/env python3
"""
TinyTorch Module Structure and Educational Scaffolding Analysis

This script analyzes the educational content across all modules to identify:
1. Module length and complexity metrics
2. Cell-by-cell breakdown and learning progression
3. Potential student overwhelm points
4. Test anxiety sources
5. Scaffolding effectiveness

Focus: Machine Learning Systems education with proper learning progression
"""

import os
import re
import ast
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
import statistics


# Jupytext "percent" cell marker: "# %%" optionally followed by a title,
# a "[cell type]" tag and key=value metadata (e.g. nbgrader settings).
_CELL_MARKER = re.compile(r'^# %%(?:\s+(.*))?$')
_MARKDOWN_TAG = re.compile(r'\[(?:markdown|md)\]')

# Comment/markdown style headers: one or more '#', blanks, then text.
# Only spaces/tabs are allowed as the separator so that a bare '#' line
# cannot swallow the following line.
_HEADER_LINE = re.compile(r'^#+[ \t]+(.+)$', re.MULTILINE)


def _parse_python(content: str) -> Optional[ast.AST]:
    """Return the AST for *content*, or None when it is not parseable Python."""
    try:
        return ast.parse(content)
    except (SyntaxError, ValueError, RecursionError):
        # SyntaxError: invalid code; ValueError: null bytes in the source;
        # RecursionError: pathologically nested source.
        return None


@dataclass
class CellAnalysis:
    """Analysis of a single notebook cell"""
    cell_type: str  # markdown, code, export, etc.
    line_count: int  # number of non-blank lines in the cell
    char_count: int
    complexity_score: int  # 1-5 scale
    educational_type: str  # concept, implementation, test, etc.
    has_todo: bool
    has_hints: bool
    concepts_introduced: List[str]


@dataclass
class ModuleAnalysis:
    """Comprehensive analysis of a module"""
    name: str
    path: str
    total_lines: int
    total_cells: int
    cell_analyses: List[CellAnalysis]
    concepts_covered: List[str]
    learning_progression: List[str]
    test_count: int
    todo_count: int
    hint_count: int
    complexity_distribution: Dict[int, int]  # complexity score (1-5) -> number of cells
    potential_overwhelm_points: List[str]
    scaffolding_quality: int  # 1-5 scale


class NotebookAnalyzer:
    """Analyzes TinyTorch development notebooks for educational effectiveness"""

    # Directory-name prefixes of the modules that are included in the analysis.
    MODULE_PREFIXES = ('00_', '01_', '02_', '03_', '04_', '05_', '06_', '07_')

    def __init__(self, modules_dir: str = "modules/source"):
        """Create an analyzer rooted at *modules_dir* (one sub-directory per module)."""
        self.modules_dir = Path(modules_dir)
        self.module_analyses: List[ModuleAnalysis] = []

    def analyze_all_modules(self) -> Dict[str, ModuleAnalysis]:
        """Analyze all modules in the source directory.

        Returns:
            Mapping of module directory name to its analysis. The same
            analyses are stored, in sorted directory order, on
            ``self.module_analyses`` for use by ``generate_report``.
        """
        results: Dict[str, ModuleAnalysis] = {}
        # Start from a clean slate so that calling this method twice does not
        # list every module twice in the report.
        self.module_analyses = []

        for module_dir in sorted(self.modules_dir.iterdir()):
            if not (module_dir.is_dir() and module_dir.name.startswith(self.MODULE_PREFIXES)):
                continue
            print(f"\n📚 Analyzing {module_dir.name}...")
            analysis = self.analyze_module(module_dir)
            results[module_dir.name] = analysis
            self.module_analyses.append(analysis)

        return results

    def analyze_module(self, module_path: Path) -> ModuleAnalysis:
        """Analyze a single module for educational effectiveness.

        Args:
            module_path: Directory of the module; its ``*_dev.py`` file is
                analyzed and ``tests/test_*.py`` files are counted.

        Returns:
            The module's analysis, or an empty analysis when the directory
            contains no ``*_dev.py`` file.
        """
        # Find the main development file (sorted so the choice is
        # deterministic when several candidates exist).
        dev_files = sorted(module_path.glob("*_dev.py"))
        if not dev_files:
            print(f"⚠️ No _dev.py file found in {module_path}")
            return self._create_empty_analysis(module_path.name, str(module_path))

        dev_file = dev_files[0]
        with open(dev_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse the file structure
        cells = self._parse_jupytext_cells(content)
        cell_analyses = [self._analyze_cell(cell) for cell in cells]

        # Count tests
        test_dir = module_path / "tests"
        test_count = len(list(test_dir.glob("test_*.py"))) if test_dir.exists() else 0

        return ModuleAnalysis(
            name=module_path.name,
            path=str(module_path),
            total_lines=len(content.split('\n')),
            total_cells=len(cells),
            cell_analyses=cell_analyses,
            concepts_covered=self._extract_concepts(content),
            learning_progression=self._analyze_learning_progression(cell_analyses),
            test_count=test_count,
            todo_count=sum(1 for cell in cell_analyses if cell.has_todo),
            hint_count=sum(1 for cell in cell_analyses if cell.has_hints),
            complexity_distribution={
                score: sum(1 for cell in cell_analyses if cell.complexity_score == score)
                for score in range(1, 6)
            },
            potential_overwhelm_points=self._identify_overwhelm_points(cell_analyses),
            scaffolding_quality=self._assess_scaffolding_quality(cell_analyses),
        )

    def _parse_jupytext_cells(self, content: str) -> List[Dict]:
        """Parse Jupytext percent format cells.

        A line whose stripped form is ``# %%`` (optionally followed by a
        title, a ``[markdown]``/``[md]`` tag or metadata such as
        ``nbgrader={...}``) starts a new cell. Cells tagged as markdown get
        type ``"markdown"``; every other cell is treated as ``"code"``.
        Text before the first marker is collected as a code cell, and cells
        containing only whitespace are dropped.

        Returns:
            List of ``{"type": ..., "content": ...}`` dicts in file order;
            marker lines themselves are not part of any cell's content.
        """
        cells: List[Dict] = []
        cell_type = "code"
        buffer: List[str] = []

        def flush() -> None:
            # Store the cell collected so far unless it is blank.
            text = "".join(buffer)
            if text.strip():
                cells.append({"type": cell_type, "content": text})

        for line in content.split('\n'):
            marker = _CELL_MARKER.match(line.strip())
            if marker is None:
                buffer.append(line + "\n")
                continue
            flush()
            buffer = []
            options = marker.group(1) or ""
            cell_type = "markdown" if _MARKDOWN_TAG.search(options) else "code"

        flush()
        return cells

    def _analyze_cell(self, cell: Dict) -> CellAnalysis:
        """Analyze a single cell (a ``{"type", "content"}`` dict) for educational metrics."""
        content = cell["content"]
        cell_type = cell["type"]

        return CellAnalysis(
            cell_type=cell_type,
            # Only non-blank lines count towards the cell length.
            line_count=sum(1 for line in content.split('\n') if line.strip()),
            char_count=len(content),
            complexity_score=self._calculate_complexity(content, cell_type),
            educational_type=self._classify_educational_type(content, cell_type),
            has_todo="TODO:" in content or "NotImplementedError" in content,
            has_hints="HINT" in content or "APPROACH:" in content or "EXAMPLE:" in content,
            concepts_introduced=self._extract_cell_concepts(content, cell_type),
        )

    def _calculate_complexity(self, content: str, cell_type: str) -> int:
        """Calculate complexity score 1-5 for a cell.

        Markdown is scored from math-like markers and length; any other cell
        is scored from its definitions and branching (via the AST when the
        cell parses, via keyword counts otherwise) plus a length factor.
        """
        if cell_type == "markdown":
            # Markdown complexity based on mathematical content and length
            math_indicators = content.count('$') + content.count('\\') + content.count('equation')
            length_factor = min(len(content) // 500, 3)  # 0-3 based on length
            return min(1 + math_indicators // 4 + length_factor, 5)

        complexity = 1
        tree = _parse_python(content)
        if tree is not None:
            nodes = list(ast.walk(tree))
            definitions = sum(isinstance(n, (ast.FunctionDef, ast.ClassDef)) for n in nodes)
            branches = sum(isinstance(n, (ast.For, ast.While, ast.If)) for n in nodes)
            complexity += definitions // 2
            complexity += branches // 3
        else:
            # Not parseable (e.g. a partial snippet): fall back to keyword counts.
            complexity += sum(
                content.count(keyword)
                for keyword in ('def ', 'class ', 'for ', 'while ', 'if ')
            )

        # Length factor: +1 per 20 lines, capped at +2.
        complexity += min(len(content.split('\n')) // 20, 2)
        return min(complexity, 5)

    def _classify_educational_type(self, content: str, cell_type: str) -> str:
        """Classify the educational purpose of a cell.

        The first matching rule wins, so e.g. an exported cell that still
        contains a TODO is a ``student_implementation``.
        """
        lowered = content.lower()

        if cell_type == "markdown":
            if any(word in lowered for word in ("step", "what is", "definition", "concept")):
                return "concept_introduction"
            if any(word in lowered for word in ("example", "visual", "analogy")):
                return "example_illustration"
            if any(word in lowered for word in ("summary", "recap", "conclusion")):
                return "concept_reinforcement"
            return "explanation"

        # code
        if "TODO:" in content or "NotImplementedError" in content:
            return "student_implementation"
        if "#| export" in content:
            return "solution_code"
        if "test" in lowered or "assert" in content:
            return "verification"
        if "import" in content:
            return "setup"
        return "demonstration"

    def _extract_cell_concepts(self, content: str, cell_type: str) -> List[str]:
        """Extract key concepts introduced in this cell (at most five).

        Markdown cells contribute short header texts and short bold phrases;
        other cells contribute the names of classes and functions they
        define. Unparseable code contributes nothing.
        """
        concepts: List[str] = []

        if cell_type == "markdown":
            for line in content.split('\n'):
                if line.startswith('#'):
                    # Extract from headers
                    concept = line.strip('#').strip()
                    if concept and len(concept) < 50:
                        concepts.append(concept)
                elif '**' in line:
                    # Extract from bold text
                    bold_matches = re.findall(r'\*\*(.*?)\*\*', line)
                    concepts.extend(match for match in bold_matches if len(match) < 30)
        else:
            tree = _parse_python(content)
            if tree is not None:
                for node in ast.walk(tree):
                    if isinstance(node, ast.ClassDef):
                        concepts.append(f"Class: {node.name}")
                    elif isinstance(node, ast.FunctionDef):
                        concepts.append(f"Function: {node.name}")

        return concepts[:5]  # Limit to top 5 concepts

    def _extract_concepts(self, content: str) -> List[str]:
        """Extract all major concepts from module content.

        Collects short header lines (``#``-prefixed, under 50 characters)
        plus the names of all classes and public functions found when the
        whole file parses as Python. Jupytext cell markers (``# %% ...``)
        are not headers and are skipped.

        Returns:
            Sorted list of unique concept names.
        """
        concepts = set()

        for header in _HEADER_LINE.findall(content):
            header = header.strip()
            if header and not header.startswith('%%') and len(header) < 50:
                concepts.add(header)

        tree = _parse_python(content)
        if tree is not None:
            for node in ast.walk(tree):
                if isinstance(node, ast.ClassDef):
                    concepts.add(node.name)
                elif isinstance(node, ast.FunctionDef) and not node.name.startswith('_'):
                    concepts.add(node.name)

        return sorted(concepts)

    def _analyze_learning_progression(self, cell_analyses: List[CellAnalysis]) -> List[str]:
        """Analyze the learning progression through the module.

        Returns one numbered step per concept-introduction,
        student-implementation or verification cell, in cell order; other
        cell types do not produce a step.
        """
        step_labels = {
            "concept_introduction": "Concept Introduction",
            "student_implementation": "Hands-on Implementation",
            "verification": "Verification & Testing",
        }

        progression: List[str] = []
        for cell in cell_analyses:
            label = step_labels.get(cell.educational_type)
            if label is not None:
                progression.append(f"Step {len(progression) + 1}: {label}")
        return progression

    def _identify_overwhelm_points(self, cell_analyses: List[CellAnalysis]) -> List[str]:
        """Identify potential student overwhelm points.

        Flags (with 1-based cell numbers) long cells without hints, cells of
        complexity >= 4 without a TODO, and complexity jumps of 3 or more
        relative to the previous cell.
        """
        overwhelm_points: List[str] = []
        previous: Optional[CellAnalysis] = None

        for number, cell in enumerate(cell_analyses, start=1):
            # Long cells without scaffolding
            if cell.line_count > 50 and not cell.has_hints:
                overwhelm_points.append(
                    f"Cell {number}: Long implementation without guidance ({cell.line_count} lines)"
                )

            # High complexity without TODO structure
            if cell.complexity_score >= 4 and not cell.has_todo:
                overwhelm_points.append(
                    f"Cell {number}: High complexity without student scaffolding"
                )

            # Sudden complexity jumps
            if previous is not None and cell.complexity_score - previous.complexity_score >= 3:
                overwhelm_points.append(
                    f"Cell {number}: Sudden complexity jump from "
                    f"{previous.complexity_score} to {cell.complexity_score}"
                )

            previous = cell

        return overwhelm_points

    def _assess_scaffolding_quality(self, cell_analyses: List[CellAnalysis]) -> int:
        """Assess overall scaffolding quality (1-5 scale).

        Starts at 3, adds up to 2 for hint coverage of student-implementation
        cells, adds 0.5 when at least two concept-introduction cells exist,
        subtracts 1 when more than 30% of cells have complexity >= 4, then
        truncates and clamps to 1-5. An empty module scores 1.
        """
        if not cell_analyses:
            return 1

        score = 3.0  # Start with average

        # Positive factors
        implementation_cells = [
            c for c in cell_analyses if c.educational_type == "student_implementation"
        ]
        if implementation_cells:
            hinted = sum(1 for c in implementation_cells if c.has_hints)
            score += (hinted / len(implementation_cells)) * 2  # Up to +2 for good hint coverage

        # Check for good progression
        concept_cells = sum(
            1 for c in cell_analyses if c.educational_type == "concept_introduction"
        )
        if concept_cells >= 2:
            score += 0.5  # Good conceptual foundation

        # Negative factors
        complex_cells = sum(1 for c in cell_analyses if c.complexity_score >= 4)
        if complex_cells / len(cell_analyses) > 0.3:
            score -= 1  # Too many high-complexity cells

        return max(1, min(5, int(score)))

    def _create_empty_analysis(self, name: str, path: str) -> ModuleAnalysis:
        """Create empty analysis for modules without dev files"""
        return ModuleAnalysis(
            name=name,
            path=path,
            total_lines=0,
            total_cells=0,
            cell_analyses=[],
            concepts_covered=[],
            learning_progression=[],
            test_count=0,
            todo_count=0,
            hint_count=0,
            complexity_distribution={i: 0 for i in range(1, 6)},
            potential_overwhelm_points=[],
            scaffolding_quality=1,
        )

    def generate_report(self) -> str:
        """Generate comprehensive analysis report.

        Returns:
            A markdown report over ``self.module_analyses``, or a short
            notice when no modules have been analyzed yet.
        """
        if not self.module_analyses:
            return "No modules analyzed yet. Run analyze_all_modules() first."

        modules = self.module_analyses
        report: List[str] = []
        report.append("# TinyTorch Educational Content Analysis Report")
        report.append("=" * 50)

        # Overall statistics
        total_lines = sum(m.total_lines for m in modules)
        total_cells = sum(m.total_cells for m in modules)
        avg_scaffolding = statistics.mean(m.scaffolding_quality for m in modules)

        report.append("\n## 📊 Overall Statistics")
        report.append(f"- Total modules analyzed: {len(modules)}")
        report.append(f"- Total lines of content: {total_lines:,}")
        report.append(f"- Total cells: {total_cells}")
        report.append(f"- Average scaffolding quality: {avg_scaffolding:.1f}/5.0")

        # Module-by-module breakdown
        report.append("\n## 📚 Module-by-Module Analysis")

        for analysis in modules:
            report.append(f"\n### {analysis.name}")
            report.append(f"- **Lines**: {analysis.total_lines:,}")
            report.append(f"- **Cells**: {analysis.total_cells}")
            report.append(f"- **Concepts**: {len(analysis.concepts_covered)}")
            report.append(f"- **TODOs**: {analysis.todo_count}")
            report.append(f"- **Hints**: {analysis.hint_count}")
            report.append(f"- **Tests**: {analysis.test_count}")
            report.append(f"- **Scaffolding Quality**: {analysis.scaffolding_quality}/5")

            if analysis.potential_overwhelm_points:
                report.append("- **⚠️ Potential Overwhelm Points**:")
                for point in analysis.potential_overwhelm_points[:3]:  # Show top 3
                    report.append(f"  - {point}")

        # Recommendations
        report.append("\n## 🎯 Educational Recommendations")

        # Identify modules needing attention
        low_scaffolding = [m for m in modules if m.scaffolding_quality <= 2]

        # (module, share of cells with complexity 4 or 5); modules without
        # cells are skipped to avoid dividing by zero.
        high_complexity: List[Tuple[ModuleAnalysis, float]] = []
        for m in modules:
            if m.total_cells <= 0:
                continue
            complex_cells = m.complexity_distribution.get(4, 0) + m.complexity_distribution.get(5, 0)
            if complex_cells > m.total_cells * 0.3:
                high_complexity.append((m, complex_cells / m.total_cells))

        if low_scaffolding:
            report.append("\n### 🚨 Modules Needing Better Scaffolding:")
            for module in low_scaffolding:
                report.append(f"- **{module.name}**: Quality {module.scaffolding_quality}/5")

        if high_complexity:
            report.append("\n### 📈 Modules with High Complexity:")
            for module, complex_ratio in high_complexity:
                report.append(f"- **{module.name}**: {complex_ratio:.1%} high-complexity cells")

        # Best practices recommendations
        report.append("\n### ✅ Recommended Best Practices:")
        # Empty modules (no dev file) are excluded from the range; when every
        # module is empty there is no range to show.
        line_counts = [m.total_lines for m in modules if m.total_lines > 0]
        if line_counts:
            report.append(
                f"- **Ideal module length**: 200-400 lines "
                f"(current range: {min(line_counts)}-{max(line_counts)})"
            )
        else:
            report.append("- **Ideal module length**: 200-400 lines")
        report.append("- **Cell complexity**: Max 30% high-complexity cells")
        report.append("- **Scaffolding ratio**: All implementation cells should have hints")
        report.append("- **Progression**: Concept → Example → Implementation → Verification")

        return "\n".join(report)


def main() -> None:
    """Analyze all modules, print the report and save it as markdown."""
    analyzer = NotebookAnalyzer()
    analyzer.analyze_all_modules()
    report = analyzer.generate_report()

    print("\n" + "=" * 60)
    print(report)

    # Save detailed report. The report contains emoji, so the encoding is
    # pinned instead of relying on the platform default.
    with open("educational_analysis_report.md", "w", encoding="utf-8") as f:
        f.write(report)
    print("\n📄 Detailed report saved to: educational_analysis_report.md")


if __name__ == "__main__":
    main()