# Mirror of https://github.com/harvard-edge/cs249r_book.git
# Synced 2026-05-02 10:39:10 -05:00
# Remove redundant ml_ prefix from ml_workflow chapter files and update all Quarto config references. Consolidate custom scripts into native binder subcommands and archive obsolete tooling.
# 99 lines, 3.7 KiB, Python
import sys
|
|
import os
|
|
|
|
# Configuration: ordered list of chapter sources to scan.
# The paths are relative, so they are resolved against the current working
# directory when the script runs.
CHAPTERS = [
    "book/quarto/contents/vol1/introduction/introduction.qmd",
    "book/quarto/contents/vol1/ml_systems/ml_systems.qmd",
    "book/quarto/contents/vol1/ml_workflow/ml_workflow.qmd",
    "book/quarto/contents/vol1/data_engineering/data_engineering.qmd",
    "book/quarto/contents/vol1/nn_computation/nn_computation.qmd",
    "book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd",
    "book/quarto/contents/vol1/frameworks/frameworks.qmd",
    "book/quarto/contents/vol1/training/training.qmd",
    "book/quarto/contents/vol1/optimizations/model_compression.qmd",
    "book/quarto/contents/vol1/hw_acceleration/hw_acceleration.qmd",
    "book/quarto/contents/vol1/data_efficiency/data_efficiency.qmd",
    "book/quarto/contents/vol1/benchmarking/benchmarking.qmd",
    "book/quarto/contents/vol1/model_serving/model_serving.qmd",
    "book/quarto/contents/vol1/ml_ops/ml_ops.qmd",
    "book/quarto/contents/vol1/responsible_engr/responsible_engr.qmd",
    "book/quarto/contents/vol1/conclusion/conclusion.qmd",
]

# Only "@" references whose name starts with one of these prefixes are
# validated. Kept as a tuple so it can be passed directly to str.startswith().
STRUCTURAL_PREFIXES = ("sec-", "fig-", "tbl-", "eq-", "ch-", "part-")
|
|
|
|
def extract_labels(line):
    """Return the identifiers defined by ``{#...}`` attribute blocks on *line*.

    An attribute block may carry classes or key/value pairs after the
    identifier (for example ``{#fig-a width=50%}``); only the identifier
    itself, i.e. the first whitespace-delimited token, is returned so that it
    can be compared against ``@fig-a`` style references.

    Assumption: a label never spans more than one line.

    Args:
        line: A single line of text.

    Returns:
        The identifiers in order of appearance. Blocks with no identifier
        (``{#}``) and an unterminated ``{#`` are ignored.
    """
    start_token = "{#"
    end_token = "}"

    labels = []
    current_pos = 0
    while True:
        start_idx = line.find(start_token, current_pos)
        if start_idx == -1:
            break
        end_idx = line.find(end_token, start_idx)
        if end_idx == -1:
            # Opening token without a closing brace: nothing more to extract.
            break

        # Everything after the first token is attributes, not part of the id.
        tokens = line[start_idx + len(start_token):end_idx].split()
        if tokens:
            labels.append(tokens[0])

        current_pos = end_idx + 1
    return labels
|
|
|
|
def scan_files():
    """Validate structural cross-references across every file in CHAPTERS.

    Works in three phases:

    1. Collect every label returned by ``extract_labels`` for each line,
       printing a warning when a label is seen more than once.
    2. Collect every ``@name`` token whose name starts with one of
       ``STRUCTURAL_PREFIXES``, remembering the file and line number.
    3. Print each reference that has no matching label.

    Chapters that do not exist on disk are reported in phase 1 and skipped;
    a missing file does not by itself produce a failing exit status.

    Raises:
        SystemExit: Always. The status is 1 when at least one broken
            reference was found and 0 otherwise.
    """
    # Brackets and sentence punctuation that can hug a reference are mapped
    # to spaces in one pass so that split() isolates the "@name" tokens.
    separators = str.maketrans(dict.fromkeys("][().,;", " "))

    defined_labels = set()
    references = []

    print("Scanning for labels...")
    for filepath in CHAPTERS:
        if not os.path.exists(filepath):
            print(f"Error: File not found: {filepath}")
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                for label in extract_labels(line):
                    if label in defined_labels:
                        print(f"Warning: Duplicate label '{label}' found in {filepath}:{i}")
                    defined_labels.add(label)

    print(f"Found {len(defined_labels)} unique labels.")

    print("\nScanning for references...")
    for filepath in CHAPTERS:
        if not os.path.exists(filepath):
            # Already reported during the label pass.
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                for word in line.translate(separators).split():
                    if not word.startswith('@'):
                        continue

                    ref = word[1:]
                    # A reference key cannot end in punctuation, so drop
                    # trailing characters such as ':', '?', '!' or '*' that
                    # belong to the surrounding prose or markup. Without
                    # this, "@fig-a:" would be reported as a broken link.
                    while ref and not (ref[-1].isalnum() or ref[-1] == "_"):
                        ref = ref[:-1]

                    if ref.startswith(STRUCTURAL_PREFIXES):
                        references.append((filepath, i, ref))

    print("\nValidating references...")
    broken_links = [
        f"{source}:{line} -> Reference @{ref} not found."
        for source, line, ref in references
        if ref not in defined_labels
    ]

    if broken_links:
        print(f"\n❌ Found {len(broken_links)} broken references:")
        for error in broken_links:
            print(error)
        sys.exit(1)
    else:
        print(f"\n✅ All {len(references)} structural cross-references are valid.")
        sys.exit(0)
|
|
|
|
if __name__ == "__main__":
    # Run the check only when executed as a script. scan_files() ends the
    # process itself via sys.exit(), with status 1 on broken references and
    # 0 otherwise.
    scan_files()