mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-02 18:50:17 -05:00
Remove redundant ml_ prefix from ml_workflow chapter files and update all Quarto config references. Consolidate custom scripts into native binder subcommands and archive obsolete tooling.
72 lines
2.3 KiB
Python
72 lines
2.3 KiB
Python
# book/tools/capture_state.py
|
|
import re
|
|
import json
|
|
import sys
|
|
import glob
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Add the quarto directory to sys.path so we can import mlsys.
# NOTE: os.path.abspath resolves "book/quarto" against the current working
# directory, so this only points at the intended folder when the script is
# started from the directory that contains "book/".
# NOTE(review): no direct `import mlsys` is visible in this script; presumably
# the chapter code executed further down is what needs it -- confirm.
sys.path.append(os.path.abspath("book/quarto"))
|
|
|
|
def extract_python_cells(qmd_path):
    """Return the source of every ```{python} block in a QMD file.

    Args:
        qmd_path: Path of the .qmd file to read.

    Returns:
        The bodies of all matched blocks joined with a newline, in document
        order. An empty string is returned when the file has no such block.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(qmd_path, 'r', encoding="utf-8") as f:
        content = f.read()

    # Matches ```{python} or ```{python, echo=False} etc.
    # The optional attribute part is restricted to the header line
    # ([^}\n]*): with re.DOTALL a plain ".*?" could run across newlines and
    # treat an unterminated "{python ..." as a header closed by some later
    # "}" in the prose, which mis-pairs the fences that follow.
    # re.DOTALL is still required so the captured body can span many lines.
    pattern = r"```\{python(?:[ ,][^}\n]*)?\}(.*?)```"
    matches = re.findall(pattern, content, re.DOTALL)
    return "\n".join(matches)
|
|
|
|
def execute_and_capture(chapter_name, code):
    """Execute chapter code and return its simple top-level variables.

    Args:
        chapter_name: Label used only in the error message.
        code: Python source to execute.

    Returns:
        A dict mapping each top-level name that does not start with "_" to
        its value, restricted to values that are str, int or float instances
        (bool is a subclass of int, so booleans are kept as well). An empty
        dict is returned when the code raises an Exception.
    """
    # WARNING: exec() provides no isolation at all -- the code runs with the
    # full privileges of this process. Only feed it trusted chapter sources.
    #
    # A single dict is used as both globals and locals. With separate dicts,
    # top-level assignments and imports land in the locals mapping, while
    # functions defined by the code look names up in the globals mapping, so
    # any helper that reads a top-level variable fails with NameError.
    namespace = {}
    try:
        exec(code, namespace)
    except Exception as e:
        print(f"❌ Error executing {chapter_name}: {e}")
        # Print the first few lines of code to debug
        print(f" Code snippet: {code[:200]}...")
        return {}

    # Capture only simple types (str, int, float) to avoid object
    # serialization issues. The underscore filter also drops the
    # "__builtins__" entry that exec() inserts into the namespace.
    return {
        name: value
        for name, value in namespace.items()
        if not name.startswith("_") and isinstance(value, (str, int, float))
    }
|
|
|
|
def main():
    """Snapshot the simple variables of every vol1 chapter into a JSON file.

    Globs the .qmd files below book/quarto/contents/vol1 (relative to the
    current working directory), skips a fixed set of utility stems, runs the
    Python cells of each remaining file and writes the per-chapter results,
    keyed by file stem, to book/tools/baseline_state.json.
    """
    # Stems of files that are skipped as utility files.
    utility_stems = {"404", "index", "intro", "references", "glossary"}

    chapter_paths = glob.glob("book/quarto/contents/vol1/**/*.qmd", recursive=True)
    chapter_paths.sort()
    snapshot = {}

    print(f"📸 Capturing baseline state for {len(chapter_paths)} chapters...")

    for chapter_path in chapter_paths:
        stem = Path(chapter_path).stem
        if stem in utility_stems:
            continue

        # No newline yet: the outcome is appended to this same line below.
        print(f" - Processing {stem}...", end="", flush=True)
        cell_source = extract_python_cells(chapter_path)
        if cell_source:
            chapter_vars = execute_and_capture(stem, cell_source)
            snapshot[stem] = chapter_vars
            print(f" ✅ ({len(chapter_vars)} vars)")
        else:
            print(" (no code)")

    destination = "book/tools/baseline_state.json"
    with open(destination, "w") as out_file:
        # sort_keys keeps the file layout independent of insertion order.
        json.dump(snapshot, out_file, indent=2, sort_keys=True)

    print(f"\n✨ Baseline captured to {destination}")
|
|
|
|
# Run the capture only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|