mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-03 08:08:51 -05:00
Remove redundant ml_ prefix from ml_workflow chapter files and update all Quarto config references. Consolidate custom scripts into native binder subcommands and archive obsolete tooling.
139 lines
5.3 KiB
Python
139 lines
5.3 KiB
Python
import argparse
import os
import re
import sys
from collections import defaultdict
from pathlib import Path

import yaml
from openai import OpenAI

client = OpenAI()
|
|
|
|
# -- Set your OpenAI model here
# Passed as the `model=` argument of the chat completion request in call_openai().
OPENAI_MODEL = "gpt-4o"
|
|
|
|
# -- Prompt Template
# Instruction text placed in front of the extracted header list (see main()).
# The example below must itself be a valid YAML list of mappings, because the
# model's reply is parsed with yaml.safe_load and read via the 'original' and
# 'revised' keys; `revised` is therefore indented under its `- original` item.
PROMPT_TEMPLATE = """
You are assisting with editing section headers for a textbook on Machine Learning Systems. The headers are extracted from `.qmd` Markdown files. Your task is to revise the headers to be suitable for a professional, technically rigorous textbook.

Please follow these guidelines:
- Concise: Keep headers short (ideally under 5 words), clear, and impactful.
- Hierarchical Awareness: Analyze all headers before editing. Ensure that subheaders (e.g., ###) are meaningfully distinct from their parent headers and do not repeat information unnecessarily.
- Consistent Tone: Use an academic, systems-oriented style. Assume the reader is technically literate but learning the concepts for the first time.
- No Numbering: Do not include chapter or section numbers (e.g., “3.1”).
- No Markdown Changes: Only update the text of the headers, not the Markdown level (#, ##, etc.).

Return your output in the following YAML format:

- original: "## Introduction to Compilation Techniques for Machine Learning"
  revised: "## Compilation Techniques"
- original: "### Explaining Why Compilers Matter in ML Pipelines"
  revised: "### Why Compilers Matter"

Make sure each header revision respects the hierarchy and flow of the textbook. Do not skip any headers, even if they seem fine — evaluate all.
Here is the full list of headers:
"""
|
|
|
|
def find_qmd_headers_in_file(file_path):
    """Collect the Markdown headers of a single ``.qmd`` file.

    A header is a line made of optional leading whitespace, one to six ``#``
    characters and a space. Lines inside fenced code blocks (fences of three
    or more backticks or tildes) are ignored, so code comments such as
    ``# load data`` are not reported as headers.

    Args:
        file_path: Object exposing ``open()`` like ``pathlib.Path``.

    Returns:
        Tuple ``(header_map, file_line_map)``:
        ``header_map`` is the list of stripped header lines in file order;
        ``file_line_map`` is a ``defaultdict(list)`` mapping each stripped
        header to ``(file_path, zero_based_line_index, raw_line)`` tuples.
    """
    header_re = re.compile(r'^\s*#{1,6} ')
    fence_re = re.compile(r'^\s*(`{3,}|~{3,})')

    header_map = []
    file_line_map = defaultdict(list)
    open_fence = None  # marker that opened the code block we are currently in

    with file_path.open(encoding="utf-8") as f:
        for i, line in enumerate(f):
            fence = fence_re.match(line)
            if fence:
                marker = fence.group(1)
                if open_fence is None:
                    open_fence = marker
                else:
                    # A closing fence uses the same character, is at least as
                    # long as the opener, and carries no trailing text.
                    trailing = line.strip()[len(marker):].strip()
                    if (marker[0] == open_fence[0]
                            and len(marker) >= len(open_fence)
                            and not trailing):
                        open_fence = None
                continue
            if open_fence is not None:
                continue
            if header_re.match(line):
                clean_line = line.strip()
                header_map.append(clean_line)
                file_line_map[clean_line].append((file_path, i, line))

    return header_map, file_line_map
|
|
|
|
def find_qmd_headers_in_directory(directory):
    """Gather headers from every ``.qmd`` file found recursively under *directory*.

    Each file is scanned with ``find_qmd_headers_in_file`` and the per-file
    results are merged.

    Args:
        directory: Root directory to search (anything ``pathlib.Path`` accepts).

    Returns:
        Tuple ``(header_map, file_line_map)``: the concatenated list of
        headers from all files, and a ``defaultdict(list)`` mapping each
        header to the combined location entries reported for it.
    """
    all_headers = []
    locations = defaultdict(list)

    for qmd_path in Path(directory).rglob("*.qmd"):
        per_file_headers, per_file_locations = find_qmd_headers_in_file(qmd_path)
        all_headers += per_file_headers
        # The same header text may occur in several files; keep every location.
        for header in per_file_locations:
            locations[header] += per_file_locations[header]

    return all_headers, locations
|
|
|
|
def call_openai(prompt):
    """Send *prompt* as a single user message and return the reply text.

    Prints the first 1000 characters of the prompt for debugging, then issues
    one chat completion request through the module-level ``client`` using
    ``OPENAI_MODEL`` and a temperature of 0.3.

    Args:
        prompt: Full prompt string to send.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    print("[DEBUG] Prompt sent to OpenAI:\n", prompt[:1000], "...\n")

    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
        temperature=0.3,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
|
|
def apply_revisions(file_line_map, replacements):
    """Write revised header text back into the files the headers came from.

    Args:
        file_line_map: Mapping of stripped header text to a list of
            ``(path, zero_based_line_index, raw_line)`` tuples, where ``path``
            exposes ``open()`` like ``pathlib.Path``.
        replacements: Iterable of mappings with string values under the keys
            ``'original'`` and ``'revised'``. ``None`` is treated as empty.

    Behavior:
        * Entries that are not mappings, lack either key, or hold non-string
          values are reported and skipped instead of raising.
        * Entries whose revision is empty or identical to the original, or
          whose original is not in ``file_line_map``, are ignored.
        * The leading whitespace of the original header line is kept.
        * Every affected file is read and rewritten once, after all
          replacements have been collected.
    """
    # path -> {line index: replacement line without trailing newline}
    pending = {}

    for item in replacements or []:
        if not isinstance(item, dict):
            print(f"[WARN] Skipping malformed revision entry: {item!r}")
            continue
        original = item.get('original')
        revised = item.get('revised')
        if not isinstance(original, str) or not isinstance(revised, str):
            print(f"[WARN] Skipping malformed revision entry: {item!r}")
            continue

        original = original.strip()
        revised = revised.strip()
        # An empty revision would blank out the header line; treat as no-op.
        if not revised or original == revised:
            continue
        if original not in file_line_map:
            continue

        for path, idx, line in file_line_map[original]:
            print(f"[DEBUG] Updating in {path}:\n From: {original}\n To: {revised}")
            indent = line[:len(line) - len(line.lstrip())]
            pending.setdefault(path, {})[idx] = indent + revised

    for path, edits in pending.items():
        with path.open(encoding="utf-8") as f:
            content = f.readlines()
        for idx, new_line in edits.items():
            if idx >= len(content):
                # File is shorter than when it was scanned; do not guess.
                print(f"[WARN] Line {idx + 1} no longer exists in {path}; skipped.")
                continue
            content[idx] = new_line + "\n"
        with path.open('w', encoding="utf-8") as f:
            f.writelines(content)
|
|
|
|
def strip_yaml_fence(text):
    """Remove surrounding ```yaml ... ``` fence if present.

    Args:
        text: Raw model output; ``None`` or blank input is accepted.

    Returns:
        The text between the first and last line when both start with a
        triple-backtick fence, otherwise the input stripped of surrounding
        whitespace. Empty, whitespace-only, or ``None`` input yields ``""``.
    """
    stripped = (text or "").strip()
    if not stripped:
        # splitlines() would give [], and indexing lines[0] would raise.
        return ""

    lines = stripped.splitlines()
    if lines[0].strip().startswith("```") and lines[-1].strip().startswith("```"):
        return "\n".join(lines[1:-1]).strip()
    return stripped
|
|
|
|
def main():
    """Command-line entry point: revise ``.qmd`` section headers via OpenAI.

    Steps:
        1. Require the ``OPENAI_API_KEY`` environment variable.
        2. Parse ``-d/--dir`` or ``-f/--file`` (exactly one is required).
        3. Collect headers from the directory or the single file.
        4. Send the prompt template plus the header list to the model.
        5. Parse the YAML reply and apply the revisions to the files.

    Exits with status 1 when the API key is missing or the ``--file``
    argument does not name an existing ``.qmd`` file. Returns without
    modifying anything when no headers are found, the reply is empty, the
    reply is not valid YAML, or the parsed YAML is not a list.
    """
    # NOTE(review): the module-level `client = OpenAI()` runs at import time,
    # before this check; a missing key may already fail there - confirm.
    if not os.getenv("OPENAI_API_KEY"):
        print("[ERROR] OPENAI_API_KEY is not set. Please export it before running.")
        sys.exit(1)

    parser = argparse.ArgumentParser(description="Revise section headers in .qmd files using OpenAI.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-d", "--dir", help="Directory to scan for .qmd files")
    group.add_argument("-f", "--file", help="Specific .qmd file to process")
    args = parser.parse_args()

    if args.dir:
        print(f"[INFO] Scanning headers in directory {args.dir}")
        headers, file_line_map = find_qmd_headers_in_directory(args.dir)
    else:
        file_path = Path(args.file)
        if not file_path.exists() or file_path.suffix != '.qmd':
            print(f"[ERROR] File {args.file} does not exist or is not a .qmd file.")
            sys.exit(1)
        print(f"[INFO] Scanning headers in file {args.file}")
        headers, file_line_map = find_qmd_headers_in_file(file_path)

    if not headers:
        print("[WARN] No headers found.")
        return

    header_list = "\n".join(headers)
    full_prompt = PROMPT_TEMPLATE.strip() + "\n" + header_list

    print("[INFO] Sending headers to OpenAI...")
    raw_output = call_openai(full_prompt)

    # The message content can be missing; slicing None below would raise.
    if not raw_output:
        print("[ERROR] OpenAI returned an empty response.")
        return

    print("[DEBUG] Raw response:\n", raw_output[:1000], "...\n")

    print("[INFO] Parsing YAML response...")
    try:
        cleaned_output = strip_yaml_fence(raw_output)
        revisions = yaml.safe_load(cleaned_output)
    except yaml.YAMLError as e:
        print("[ERROR] Failed to parse YAML:", e)
        print(raw_output)
        return

    # safe_load returns None, a str or a dict for replies that are valid YAML
    # but not the requested list of {original, revised} mappings.
    if not isinstance(revisions, list):
        print("[ERROR] Expected a YAML list of revisions, got:", type(revisions).__name__)
        print(raw_output)
        return

    print(f"[INFO] Applying {len(revisions)} revisions...")
    apply_revisions(file_line_map, revisions)
    print("[DONE] All headers updated.")
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|