mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-03 08:08:51 -05:00
Remove redundant ml_ prefix from ml_workflow chapter files and update all Quarto config references. Consolidate custom scripts into native binder subcommands and archive obsolete tooling.
112 lines
3.8 KiB
Python
112 lines
3.8 KiB
Python
import os
|
|
import re
|
|
import glob
|
|
|
|
# Attribute block carrying a figure id, e.g. ``![cap](img.png){#fig-name}`` or
# ``::: {#fig-name fig-cap="..."}`` (this also covers fenced-div ids, so no
# separate div pattern is needed).  The lazy ``.*?`` takes the FIRST ``#fig-``
# after an opening brace; a greedy ``.*`` would capture the last one on the
# line and mis-record figures whose caption mentions another ``#fig-`` id.
# The lookahead keeps the requirement that a closing brace follows on the line.
_FIG_ATTR_PATTERN = re.compile(r'\{.*?#fig-([\w-]+)(?=.*\})')
# Executable-cell label: ``#| label: fig-name``.
_FIG_CODE_LABEL_PATTERN = re.compile(r'#\|\s*label:\s*fig-([\w-]+)')
# In-text cross-reference: ``@fig-name``.
_FIG_REF_PATTERN = re.compile(r'@fig-([\w-]+)')


def _find_figure_definitions(lines):
    """Return ``{fig_id: line_number}`` for every figure defined in *lines*.

    Line numbers are 1-based.  If an id is defined more than once, the last
    occurrence wins.
    """
    defined = {}
    for line_num, line in enumerate(lines, start=1):
        # finditer (not search) so that several attribute blocks on one line,
        # e.g. two side-by-side images, are all recorded.
        attr_ids = [m.group(1) for m in _FIG_ATTR_PATTERN.finditer(line)]
        if attr_ids:
            for fig_id in attr_ids:
                defined[fig_id] = line_num
            continue

        code_match = _FIG_CODE_LABEL_PATTERN.search(line)
        if code_match:
            defined[code_match.group(1)] = line_num
    return defined


def _find_figure_references(lines):
    """Return one ``{'id', 'line', 'context'}`` dict per ``@fig-`` reference."""
    references = []
    for idx, line in enumerate(lines):
        matches = list(_FIG_REF_PATTERN.finditer(line))
        if not matches:
            continue
        # Context is the previous, current and next line, each stripped and
        # followed by a single space.  Slicing clamps at the end of the file.
        window = lines[max(0, idx - 1):idx + 2]
        context = "".join(text.strip() + " " for text in window)
        for match in matches:
            references.append({
                'id': match.group(1),
                'line': idx + 1,
                'context': context,
            })
    return references


def audit_figures(vol_path):
    """Audit figure definitions and ``@fig-`` references in ``.qmd`` files.

    Recursively scans *vol_path* for ``*.qmd`` files and prints a report to
    stdout with two sections:

    1. Figures that are defined in a file but never referenced in that same
       file.  The comparison is per file, so a figure referenced only from a
       different file is still listed here.
    2. For every reference to a locally defined figure, the first 200
       characters of its surrounding text (previous, current and next line),
       for manual review of how the figure is introduced.

    References to figures not defined in the same file are deliberately not
    reported, since they may point to other chapters.

    Files and figures are reported in a stable order (sorted paths; figures
    by definition line) so that repeated runs produce identical output.

    Args:
        vol_path: Root directory to search for ``.qmd`` files.

    Returns:
        None.  The report is written to stdout.
    """
    print(f"Auditing Volume 1 in: {vol_path}\n")

    # glob returns paths in arbitrary order; sort for reproducible reports.
    qmd_files = sorted(
        glob.glob(os.path.join(vol_path, "**/*.qmd"), recursive=True)
    )

    audit_report = {}
    for fpath in qmd_files:
        with open(fpath, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        defined_figs = _find_figure_definitions(lines)
        referenced_figs = _find_figure_references(lines)

        # Files without any figure activity are left out of the report.
        if defined_figs or referenced_figs:
            audit_report[os.path.relpath(fpath, vol_path)] = {
                'definitions': defined_figs,
                'references': referenced_figs,
            }

    # Analysis
    print("-" * 60)
    print("FIGURE AUDIT REPORT")
    print("-" * 60)

    total_unreferenced = 0

    for fname, data in audit_report.items():
        definitions = data['definitions']
        referenced_ids = {ref['id'] for ref in data['references']}

        # 1. Unreferenced figures, ordered by where they are defined.
        unref = sorted(
            (fig for fig in definitions if fig not in referenced_ids),
            key=lambda fig: (definitions[fig], fig),
        )
        if unref:
            print(f"\n[UNREFERENCED] {fname}:")
            for fig in unref:
                print(f" - fig-{fig} (Line {definitions[fig]})")
            total_unreferenced += len(unref)

    print(f"\nTotal Unreferenced Figures: {total_unreferenced}")
    print("-" * 60)

    # 2. Reference quality check (sampling)
    print("\nREFERENCE CONTEXT SAMPLE (Check for explanation quality):")
    for fname, data in audit_report.items():
        if not data['references']:
            continue
        print(f"\nFile: {fname}")
        for ref in data['references']:
            # Only references to figures defined in this file are sampled.
            if ref['id'] in data['definitions']:
                print(f" Line {ref['line']} (@fig-{ref['id']}): ...{ref['context'][:200]}...")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: audit the Volume 1 sources, with the path taken
    # relative to the current working directory.
    audit_figures(vol_path="quarto/contents/vol1")
|