mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 02:03:55 -05:00
123 lines
5.1 KiB
Python
123 lines
5.1 KiB
Python
import os
|
|
import glob
|
|
import subprocess
|
|
import urllib.request
|
|
import urllib.parse
|
|
from bs4 import BeautifulSoup
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
# Repository-relative locations; the script expects the repository root as
# its working directory.
book_dir = "book/quarto/contents"
out_dir = ".claude/_reviews/Figure Audit"
img_tmp_dir = ".claude/_reviews/Figure Audit/audit_images_tmp"
brief_path = ".claude/_plans/figure-audit-brief.md"

# Make sure the report directory and the image scratch directory exist.
for _needed_dir in (out_dir, img_tmp_dir):
    os.makedirs(_needed_dir, exist_ok=True)

# Gather every .qmd file under both volumes, skipping files whose name
# starts with an underscore.
qmd_files = []
for volume in ('vol1', 'vol2'):
    for folder, _subdirs, names in os.walk(os.path.join(book_dir, volume)):
        qmd_files.extend(
            os.path.join(folder, name)
            for name in names
            if name.endswith('.qmd') and not name.startswith('_')
        )


def _mentions_fig_cap(path):
    """Return True when the file at *path* contains the text 'fig-cap'."""
    with open(path, 'r', encoding='utf-8') as handle:
        return 'fig-cap' in handle.read()


# Only files that contain at least one 'fig-cap' occurrence are audited.
chapters_with_figures = [path for path in qmd_files if _mentions_fig_cap(path)]

print(f"Found {len(chapters_with_figures)} chapters with figures.")
|
|
|
|
def download_file(url, local_path, timeout=60):
    """Download *url* and store the response body at *local_path*.

    This is a best-effort helper: any failure is reported on stdout and
    signalled through the return value instead of being raised.

    Args:
        url: Address to fetch. Sent with a browser-like User-Agent header.
        local_path: Destination file; written in binary mode.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        True when the body was fetched and written, False otherwise.
    """
    try:
        # Building the request is inside the try block because a malformed
        # URL makes Request() raise ValueError.
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})

        # Read the whole body before opening the destination so a failed
        # transfer never leaves an empty or truncated file behind.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            data = response.read()

        with open(local_path, 'wb') as out_file:
            out_file.write(data)
        return True
    except Exception as e:
        # Deliberately broad: callers treat a failed download as "skip this
        # image", so every error is logged and turned into False.
        print(f"Failed to download {url}: {e}")
        return False
|
|
|
|
def process_chapter(qmd_path):
    """Audit the figures of one chapter by delegating to the `gemini` CLI.

    The function derives the report name from the chapter path, saves the
    figures found in the chapter's rendered HTML page to ``img_tmp_dir``,
    builds a prompt that points at those files, and runs the CLI with it.
    A chapter whose report file already exists is skipped.

    Args:
        qmd_path: Path to the chapter's ``.qmd`` source, located under
            ``book_dir``.

    Returns:
        A one-line status string starting with ``Skipping``, ``Success:``
        or ``Failed:`` (the latter followed by the tail of the CLI output
        or the launch error).
    """
    rel_path = os.path.relpath(qmd_path, book_dir)
    parts = rel_path.split(os.sep)
    vol = parts[0]

    # Paths nested deeper than <vol>/<file>.qmd are named after the parent
    # directory; otherwise the file name without its extension is used.
    if len(parts) > 2:
        chap = parts[-2]
    else:
        chap = parts[-1].replace('.qmd', '')

    out_name = f"{vol}-{chap}.yml"
    out_path = os.path.join(out_dir, out_name)

    if os.path.exists(out_path):
        return f"Skipping {out_name}, already exists."

    # URLs always use forward slashes, whatever the local path separator is.
    rel_url = "/".join(parts).replace('.qmd', '.html')
    html_url = f"https://harvard-edge.github.io/cs249r_book_dev/{vol}/contents/{rel_url}"

    local_images = []
    try:
        req = urllib.request.Request(html_url, headers={'User-Agent': 'Mozilla/5.0'})
        # Close the response deterministically and bound the wait so one
        # stalled request cannot block a worker thread forever.
        with urllib.request.urlopen(req, timeout=60) as response:
            html_content = response.read().decode('utf-8')
        soup = BeautifulSoup(html_content, 'html.parser')

        for index, figure in enumerate(soup.find_all('figure')):
            # Figures without an id get a per-position fallback so they do
            # not overwrite each other's file on disk.
            fig_id = figure.get('id') or f"unknown_fig_{index}"
            img_tag = figure.find('img')
            svg_tag = figure.find('svg')

            if img_tag and img_tag.get('src'):
                src = img_tag['src']
                abs_url = urllib.parse.urljoin(html_url, src)
                # Keep the remote file extension; fall back to .png when the
                # URL path has none.
                ext = os.path.splitext(urllib.parse.urlparse(abs_url).path)[1] or '.png'
                local_img_path = os.path.join(img_tmp_dir, f"{vol}_{chap}_{fig_id}{ext}")
                if download_file(abs_url, local_img_path):
                    local_images.append(f"{fig_id}: {os.path.abspath(local_img_path)}")
            elif svg_tag:
                # Inline SVG has no URL to fetch: serialise the tag itself.
                local_img_path = os.path.join(img_tmp_dir, f"{vol}_{chap}_{fig_id}.svg")
                with open(local_img_path, 'w', encoding='utf-8') as f:
                    f.write(str(svg_tag))
                local_images.append(f"{fig_id}: {os.path.abspath(local_img_path)}")

    except Exception as e:
        # Best effort: the audit still runs, with whatever images were
        # collected before the failure.
        print(f"Error parsing HTML for {chap}: {e}")

    # One "<fig_id>: <absolute path>" entry per line of the prompt.
    images_instruction = "\n".join(local_images)

    prompt = f"""You are the Three-Artifact Figure Audit agent. Read the brief at: {os.path.abspath(brief_path)}
Your task is to audit the chapter located at: {os.path.abspath(qmd_path)}

Instructions:
1. I have downloaded the raw rendered HTML images to your local disk so you can visually audit them.
Use the `read_file` tool on the following local paths to view the visual figures:
{images_instruction}

2. Compare the image you see via `read_file` to the prose, `fig-cap`, and `fig-alt` in the source QMD file at {os.path.abspath(qmd_path)}.
3. Output your findings strictly in the YAML format specified in the brief. Write the final YAML report to: {os.path.abspath(out_path)}

CRITICAL CONSTRAINTS FOR FIXES:
- **You CANNOT change the figures/images themselves.** The images are immutable.
- Your `proposed_fix` in the YAML MUST ONLY target the prose, the caption (`fig-cap`), or the alt-text (`fig-alt`) in the source `.qmd` file.
- **DO NOT rewrite the caption or prose completely.** You MUST propose the most MINIMAL, surgical tweaks to the existing `.qmd` text so that it accurately aligns with what is actually shown in the immutable image.

Do not ask for permission. Act autonomously to complete the audit and write the file."""

    print(f"Starting {out_name}...")
    # Argument list (no shell), so the prompt text needs no quoting.
    cmd = ["gemini", "-m", "gemini-3.1-pro-preview", "-y", "-p", prompt]
    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        error_msg = e.stderr or e.stdout or ""
        return f"Failed: {out_name}\nError: {error_msg[-500:]}"
    except OSError as e:
        # The CLI could not be launched at all (for example it is not on
        # PATH); report it like any other failure instead of aborting the
        # whole run.
        return f"Failed: {out_name}\nError: {e}"
    return f"Success: {out_name}"
|
|
|
|
# Run the chapter audits on a small pool of worker threads; map() keeps the
# results in the same order as chapters_with_figures.
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(process_chapter, chapters_with_figures))

# Each worker returns a status line (skipped / success / failed); print them
# so the outcome of every chapter is visible.
for status in results:
    print(status)

print("\n--- Audit Run Complete ---")
|