Files
cs249r_book/book/tools/scripts/_archive/obsolete/audit_figures.py
Vijay Janapa Reddi e3cc9f7af3 refactor: rename ml_ml_workflow files, consolidate CLI, and clean up scripts
Remove redundant ml_ prefix from ml_workflow chapter files and update all
Quarto config references. Consolidate custom scripts into native binder
subcommands and archive obsolete tooling.
2026-02-13 11:06:28 -05:00

112 lines
3.8 KiB
Python

import os
import re
import glob
def audit_figures(vol_path):
    """Audit figure definitions and ``@fig-`` references in ``.qmd`` files.

    Every ``*.qmd`` file under ``vol_path`` (searched recursively) is scanned
    line by line.  A plain-text report is printed to stdout containing:

    1. Per file, the figures that are defined but never referenced *within
       that same file*, in order of definition line.
    2. For each reference to a locally defined figure, up to 200 characters
       of surrounding text (previous, current, and next line).

    References to figures that are not defined in the same file are collected
    but not reported, because they may point to other chapters.

    Args:
        vol_path: Directory to search for ``.qmd`` files.

    Returns:
        None. The report is written to stdout.

    Raises:
        OSError: If a matched file cannot be opened.
        UnicodeDecodeError: If a matched file is not valid UTF-8.
    """
    print(f"Auditing Volume 1 in: {vol_path}\n")

    # Figure-definition syntaxes.  At most one definition is recorded per line.
    # Attribute block carrying an id: {#fig-name} or { #fig-name ... }.
    # This also covers fenced divs (::: {#fig-name ... }): a div opener is
    # itself a brace-delimited attribute block, so no separate div pattern
    # is needed.
    attr_pattern = re.compile(r'\{.*#fig-([\w-]+).*\}')
    # Code-cell option: #| label: fig-name
    code_label_pattern = re.compile(r'#\|\s*label:\s*fig-([\w-]+)')
    # Cross-reference: @fig-name
    ref_pattern = re.compile(r'@fig-([\w-]+)')

    # glob returns paths in filesystem-dependent order; sort so that the
    # report is reproducible from run to run.
    qmd_files = sorted(
        glob.glob(os.path.join(vol_path, "**/*.qmd"), recursive=True)
    )

    audit_report = {}
    for fpath in qmd_files:
        with open(fpath, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        defined_figs = {}     # id -> 1-based line number of its last definition
        referenced_figs = []  # one dict per reference: 'id', 'line', 'context'

        for idx, line in enumerate(lines):
            # Attribute blocks take precedence over code-cell labels.
            m_def = attr_pattern.search(line) or code_label_pattern.search(line)
            if m_def:
                defined_figs[m_def.group(1)] = idx + 1

            for m_ref in ref_pattern.finditer(line):
                # Context: previous line + current line + next line.  Slicing
                # clamps the upper bound at end of file on its own.
                window = lines[max(0, idx - 1):idx + 2]
                referenced_figs.append({
                    'id': m_ref.group(1),
                    'line': idx + 1,
                    'context': "".join(f"{text.strip()} " for text in window),
                })

        if defined_figs or referenced_figs:
            audit_report[os.path.relpath(fpath, vol_path)] = {
                'definitions': defined_figs,
                'references': referenced_figs,
            }

    # Analysis
    print("-" * 60)
    print("FIGURE AUDIT REPORT")
    print("-" * 60)

    total_unreferenced = 0
    for fname, data in audit_report.items():
        definitions = data['definitions']
        referenced_ids = {ref['id'] for ref in data['references']}

        # 1. Unreferenced figures, listed in document order rather than in
        #    (hash-randomised) set order.
        unref = sorted(
            (fig for fig in definitions if fig not in referenced_ids),
            key=definitions.get,
        )
        if unref:
            print(f"\n[UNREFERENCED] {fname}:")
            for fig in unref:
                print(f" - fig-{fig} (Line {definitions[fig]})")
            total_unreferenced += len(unref)

        # NOTE: references without a local definition are deliberately not
        # reported; they might point to figures in other chapters.

    print(f"\nTotal Unreferenced Figures: {total_unreferenced}")
    print("-" * 60)

    # 2. Reference quality check (sampling)
    print("\nREFERENCE CONTEXT SAMPLE (Check for explanation quality):")
    for fname, data in audit_report.items():
        if not data['references']:
            continue
        print(f"\nFile: {fname}")
        for ref in data['references']:
            # Only references to figures defined in this file are sampled.
            if ref['id'] in data['definitions']:
                print(f" Line {ref['line']} (@fig-{ref['id']}): ...{ref['context'][:200]}...")
# Script entry point: audit the default Volume 1 content tree.  The path is
# relative, so it is resolved against the current working directory.
if __name__ == "__main__":
    audit_figures(vol_path="quarto/contents/vol1")