# cs249r_book/book/tools/scripts/_archive/obsolete/audit_figures.py

import os
import re
import glob

# Figure-definition patterns. Each one captures the id that follows "fig-".
# Matches: {#fig-name} or { #fig-name ... }
_IMG_ATTR_PATTERN = re.compile(r'\{.*#fig-([\w-]+).*\}')
# Matches: #| label: fig-name
_CODE_LABEL_PATTERN = re.compile(r'#\|\s*label:\s*fig-([\w-]+)')
# Matches: ::: {#fig-name ... }
# Any line this matches is also matched by _IMG_ATTR_PATTERN (which is tried
# first); it is kept so the recognised div syntax stays explicit.
_DIV_ID_PATTERN = re.compile(r':::\s*\{.*#fig-([\w-]+).*\}')

# Matches: @fig-name
_REF_PATTERN = re.compile(r'@fig-([\w-]+)')

# Tried in this order; the first pattern that matches a line wins.
_DEFINITION_PATTERNS = (_IMG_ATTR_PATTERN, _CODE_LABEL_PATTERN, _DIV_ID_PATTERN)


def _find_definitions(lines):
    """Return {figure id: 1-based line number} for the definitions in *lines*."""
    defined = {}
    for line_no, text in enumerate(lines, start=1):
        for pattern in _DEFINITION_PATTERNS:
            match = pattern.search(text)
            if match:
                # At most one definition is recorded per line; a later line
                # defining the same id overwrites the earlier line number.
                defined[match.group(1)] = line_no
                break
    return defined


def _find_references(lines):
    """Return one {'id', 'line', 'context'} dict per @fig- reference in *lines*."""
    found = []
    for idx, text in enumerate(lines):
        # Context: previous line + current line + next line, each stripped
        # and followed by a single space.
        start_ctx = max(0, idx - 1)
        end_ctx = min(len(lines), idx + 2)
        for match in _REF_PATTERN.finditer(text):
            context = "".join(
                neighbour.strip() + " " for neighbour in lines[start_ctx:end_ctx]
            )
            found.append({
                'id': match.group(1),
                'line': idx + 1,
                'context': context,
            })
    return found


def audit_figures(vol_path):
    """Audit figure definitions and references in the .qmd files under *vol_path*.

    Every ``*.qmd`` file found recursively below ``vol_path`` is read as UTF-8
    and scanned line by line for figure definitions (``{#fig-...}``
    attributes, ``#| label: fig-...`` code labels, ``::: {#fig-...}`` divs)
    and for ``@fig-...`` references. A report is printed to stdout with:

    * the figures defined in a file but not referenced in that same file,
      listed in order of their definition line, plus a running total;
    * for each file, the surrounding text (truncated to 200 characters) of
      every reference whose figure is defined in the same file.

    Definitions and references are compared per file only: a figure that is
    referenced exclusively from another file is still listed as unreferenced,
    and references to figures defined elsewhere are not reported.

    Files are processed in sorted path order and unreferenced figures are
    sorted by line number so the output is identical from run to run.

    Args:
        vol_path: Directory searched recursively for ``*.qmd`` files.

    Returns:
        A dict keyed by file path relative to ``vol_path``. Each value is
        ``{'definitions': {id: line}, 'references': [{'id', 'line',
        'context'}, ...]}``. Files with neither definitions nor references
        are omitted.
    """
    print(f"Auditing Volume 1 in: {vol_path}\n")

    # glob returns paths in arbitrary (filesystem) order; sort for a stable report.
    qmd_files = sorted(glob.glob(os.path.join(vol_path, "**/*.qmd"), recursive=True))

    audit_report = {}

    for fpath in qmd_files:
        with open(fpath, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        defined_figs = _find_definitions(lines)
        referenced_figs = _find_references(lines)

        if defined_figs or referenced_figs:
            audit_report[os.path.relpath(fpath, vol_path)] = {
                'definitions': defined_figs,
                'references': referenced_figs,
            }

    # Analysis
    print("-" * 60)
    print("FIGURE AUDIT REPORT")
    print("-" * 60)

    total_unreferenced = 0

    for fname, data in audit_report.items():
        definitions = data['definitions']
        local_refs = {ref['id'] for ref in data['references']}

        # Unreferenced figures: defined here but never referenced in this file.
        # Sorting by definition line avoids the run-to-run variation of set
        # iteration order.
        unreferenced = sorted(set(definitions) - local_refs, key=definitions.get)
        if unreferenced:
            print(f"\n[UNREFERENCED] {fname}:")
            for fig in unreferenced:
                print(f"  - fig-{fig} (Line {definitions[fig]})")
            total_unreferenced += len(unreferenced)

    print(f"\nTotal Unreferenced Figures: {total_unreferenced}")
    print("-" * 60)

    # Reference Quality Check (Sampling)
    print("\nREFERENCE CONTEXT SAMPLE (Check for explanation quality):")
    for fname, data in audit_report.items():
        if not data['references']:
            continue
        print(f"\nFile: {fname}")
        for ref in data['references']:
            # Only references to figures defined in the same file are sampled.
            if ref['id'] in data['definitions']:
                print(f"  Line {ref['line']} (@fig-{ref['id']}): ...{ref['context'][:200]}...")

    return audit_report

if __name__ == "__main__":
    # Script entry point: audit the default volume directory, given relative
    # to the current working directory.
    audit_figures(vol_path="quarto/contents/vol1")