Files
cs249r_book/book/tools/scripts/_archive/genai-oneoff/header_update.py
Vijay Janapa Reddi e3cc9f7af3 refactor: rename ml_ml_workflow files, consolidate CLI, and clean up scripts
Remove redundant ml_ prefix from ml_workflow chapter files and update all
Quarto config references. Consolidate custom scripts into native binder
subcommands and archive obsolete tooling.
2026-02-13 11:06:28 -05:00

139 lines
5.3 KiB
Python

import os
import re
import yaml
import argparse
from openai import OpenAI
client = OpenAI()
from pathlib import Path
from collections import defaultdict
# -- Set your OpenAI model here
# This value is passed as `model=` to the chat-completions request made in
# call_openai().
OPENAI_MODEL = "gpt-4o"
# -- Prompt Template
# Instruction text sent ahead of the header list: main() strips it and appends
# the newline-joined headers. The reply is parsed with yaml.safe_load() and each
# item is read through its `original` / `revised` keys in apply_revisions(), so
# the YAML shape requested below must stay in sync with that function.
# NOTE(review): in this view the `revised:` example lines are not indented under
# `- original:`; confirm the example in the real file is valid YAML.
PROMPT_TEMPLATE = """
You are assisting with editing section headers for a textbook on Machine Learning Systems. The headers are extracted from `.qmd` Markdown files. Your task is to revise the headers to be suitable for a professional, technically rigorous textbook.
Please follow these guidelines:
- Concise: Keep headers short (ideally under 5 words), clear, and impactful.
- Hierarchical Awareness: Analyze all headers before editing. Ensure that subheaders (e.g., ###) are meaningfully distinct from their parent headers and do not repeat information unnecessarily.
- Consistent Tone: Use an academic, systems-oriented style. Assume the reader is technically literate but learning the concepts for the first time.
- No Numbering: Do not include chapter or section numbers (e.g., “3.1”).
- No Markdown Changes: Only update the text of the headers, not the Markdown level (#, ##, etc.).
Return your output in the following YAML format:
- original: "## Introduction to Compilation Techniques for Machine Learning"
revised: "## Compilation Techniques"
- original: "### Explaining Why Compilers Matter in ML Pipelines"
revised: "### Why Compilers Matter"
Make sure each header revision respects the hierarchy and flow of the textbook. Do not skip any headers, even if they seem fine — evaluate all.
Here is the full list of headers:
"""
def find_qmd_headers_in_file(file_path):
    """Collect the Markdown headers of a single ``.qmd`` file.

    A header is a line made of optional leading whitespace, one to six ``#``
    characters and a space. Lines inside fenced code blocks (opened by three
    or more backticks or tildes) are skipped, so ``# comments`` in embedded
    code chunks are not mistaken for section headers.

    Args:
        file_path: ``pathlib.Path`` of the file to scan.

    Returns:
        A ``(header_map, file_line_map)`` tuple. ``header_map`` is the list of
        stripped header lines in file order. ``file_line_map`` is a
        ``defaultdict(list)`` mapping each stripped header to a list of
        ``(file_path, zero_based_line_index, raw_line)`` tuples, one per
        occurrence.
    """
    header_map = []
    file_line_map = defaultdict(list)
    header_re = re.compile(r'^\s*#{1,6} ')
    fence_re = re.compile(r'^\s*(`{3,}|~{3,})')
    # Marker (e.g. "```") that opened the code block we are currently inside,
    # or None when we are in regular Markdown text.
    open_fence = None

    with file_path.open(encoding="utf-8") as f:
        for i, line in enumerate(f):
            fence = fence_re.match(line)
            if fence:
                marker = fence.group(1)
                if open_fence is None:
                    open_fence = marker
                elif (
                    marker[0] == open_fence[0]
                    and len(marker) >= len(open_fence)
                    # A closing fence carries no info string after the marker.
                    and not line.strip()[len(marker):].strip()
                ):
                    open_fence = None
                continue
            if open_fence is not None:
                # Inside a code chunk: "# ..." is a comment, not a header.
                continue
            if header_re.match(line):
                clean_line = line.strip()
                header_map.append(clean_line)
                file_line_map[clean_line].append((file_path, i, line))
    return header_map, file_line_map
def find_qmd_headers_in_directory(directory):
    """Collect the headers of every ``.qmd`` file below ``directory``.

    Files are visited in sorted path order so the header list (and therefore
    the prompt built from it) is reproducible between runs; ``Path.rglob``
    alone yields entries in filesystem order. Entries that match ``*.qmd``
    but are not regular files (for example a directory with that suffix) are
    skipped instead of failing on open.

    Args:
        directory: path (``str`` or ``Path``) of the directory to scan
            recursively.

    Returns:
        A ``(header_map, file_line_map)`` tuple with the same shape as the one
        returned by ``find_qmd_headers_in_file``, merged across all files.
    """
    header_map = []
    file_line_map = defaultdict(list)
    for path in sorted(Path(directory).rglob("*.qmd")):
        if not path.is_file():
            continue
        file_headers, file_map = find_qmd_headers_in_file(path)
        header_map.extend(file_headers)
        for key, value in file_map.items():
            file_line_map[key].extend(value)
    return header_map, file_line_map
def call_openai(prompt):
    """Send ``prompt`` as one user message and return the reply text.

    The first 1000 characters of the prompt are printed for debugging. The
    request goes through the module-level ``client`` using ``OPENAI_MODEL``
    and a temperature of 0.3; the content of the first choice is returned.
    """
    preview = prompt[:1000]
    print("[DEBUG] Prompt sent to OpenAI:\n", preview, "...\n")

    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=conversation,
        temperature=0.3,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def apply_revisions(file_line_map, replacements):
    """Write revised header text back into the files it came from.

    Args:
        file_line_map: mapping of stripped header text to a list of
            ``(path, zero_based_line_index, raw_line)`` tuples, as produced by
            ``find_qmd_headers_in_file``.
        replacements: iterable of dicts carrying ``original`` and ``revised``
            strings (the parsed model reply). ``None`` is treated as empty.

    Behaviour:
        * Entries that are not dicts, or lack string ``original``/``revised``
          values, are skipped with a warning instead of raising.
        * A revision is rejected when it changes the number of leading ``#``
          characters or spans several lines; the prompt promises the header
          level is left untouched.
        * Each file is read and written once, however many headers change.
        * A line is only overwritten when it still holds the expected header;
          its leading whitespace and line terminator are preserved.
    """
    # path -> {line index: (stripped original header, revised header)}
    pending = defaultdict(dict)

    for item in replacements or ():
        if not isinstance(item, dict):
            print(f"[WARN] Skipping malformed revision entry: {item!r}")
            continue
        original = item.get('original')
        revised = item.get('revised')
        if not isinstance(original, str) or not isinstance(revised, str):
            print(f"[WARN] Skipping malformed revision entry: {item!r}")
            continue

        original = original.strip()
        revised = revised.strip()
        if original == revised or original not in file_line_map:
            continue

        original_level = len(original) - len(original.lstrip('#'))
        revised_level = len(revised) - len(revised.lstrip('#'))
        if revised_level != original_level or len(revised.splitlines()) != 1:
            print(
                "[WARN] Skipping revision that changes the header level or "
                f"spans several lines: {original!r} -> {revised!r}"
            )
            continue

        for path, idx, _raw_line in file_line_map[original]:
            pending[path][idx] = (original, revised)

    for path, edits in pending.items():
        # newline="" disables newline translation so every line keeps its own
        # terminator (LF or CRLF) when the file is written back.
        with path.open(encoding="utf-8", newline="") as f:
            content = f.readlines()

        dirty = False
        for idx, (original, revised) in sorted(edits.items()):
            if idx >= len(content) or content[idx].strip() != original:
                print(f"[WARN] {path}: line {idx + 1} no longer matches {original!r}; skipped")
                continue
            current = content[idx]
            body = current.rstrip("\r\n")
            indent = body[:len(body) - len(body.lstrip())]
            ending = current[len(body):]
            print(f"[DEBUG] Updating in {path}:\n From: {original}\n To: {revised}")
            content[idx] = indent + revised + ending
            dirty = True

        if dirty:
            with path.open('w', encoding="utf-8", newline="") as f:
                f.writelines(content)
def strip_yaml_fence(text):
    """Remove a surrounding ```yaml ... ``` fence if present.

    Args:
        text: raw model reply; ``None`` and empty/whitespace-only strings are
            accepted and yield ``""``.

    Returns:
        The text between the first and last line when both of those lines
        start with three backticks; otherwise the input with surrounding
        whitespace removed.
    """
    stripped = (text or "").strip()
    lines = stripped.splitlines()
    if not lines:
        # Nothing to inspect; indexing lines[0] below would raise IndexError.
        return ""
    if lines[0].strip().startswith("```") and lines[-1].strip().startswith("```"):
        return "\n".join(lines[1:-1]).strip()
    return stripped
def main():
    """Command-line entry point: revise ``.qmd`` section headers via OpenAI.

    Steps: parse ``-d/--dir`` or ``-f/--file``, collect the headers, send them
    to the model together with ``PROMPT_TEMPLATE``, parse the YAML reply and
    apply the revisions to the files in place.

    Raises:
        SystemExit: with status 1 when ``OPENAI_API_KEY`` is unset or the
            ``--file`` argument is not an existing ``.qmd`` file (argparse
            itself exits on usage errors and ``--help``).
    """
    parser = argparse.ArgumentParser(description="Revise section headers in .qmd files using OpenAI.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-d", "--dir", help="Directory to scan for .qmd files")
    group.add_argument("-f", "--file", help="Specific .qmd file to process")
    # Parse before checking the key so --help and usage errors work without it.
    args = parser.parse_args()

    if not os.getenv("OPENAI_API_KEY"):
        print("[ERROR] OPENAI_API_KEY is not set. Please export it before running.")
        # SystemExit instead of the site-provided exit(), which is meant for
        # interactive sessions and is absent when Python runs with -S.
        raise SystemExit(1)

    if args.dir:
        print(f"[INFO] Scanning headers in directory {args.dir}")
        headers, file_line_map = find_qmd_headers_in_directory(args.dir)
    else:
        file_path = Path(args.file)
        if not file_path.is_file() or file_path.suffix != '.qmd':
            print(f"[ERROR] File {args.file} does not exist or is not a .qmd file.")
            raise SystemExit(1)
        print(f"[INFO] Scanning headers in file {args.file}")
        headers, file_line_map = find_qmd_headers_in_file(file_path)

    if not headers:
        print("[WARN] No headers found.")
        return

    header_list = "\n".join(headers)
    full_prompt = PROMPT_TEMPLATE.strip() + "\n" + header_list
    print("[INFO] Sending headers to OpenAI...")
    # The API may return no content; normalise to "" so slicing below is safe.
    raw_output = call_openai(full_prompt) or ""
    print("[DEBUG] Raw response:\n", raw_output[:1000], "...\n")
    if not raw_output.strip():
        print("[ERROR] Empty response from OpenAI; nothing to apply.")
        return

    print("[INFO] Parsing YAML response...")
    try:
        cleaned_output = strip_yaml_fence(raw_output)
        revisions = yaml.safe_load(cleaned_output)
    except yaml.YAMLError as e:
        print("[ERROR] Failed to parse YAML:", e)
        print(raw_output)
        return
    if not isinstance(revisions, list):
        # safe_load returns None, a str or a dict for replies that are valid
        # YAML but not the requested list of revisions.
        print("[ERROR] Expected a YAML list of revisions, got:", type(revisions).__name__)
        print(raw_output)
        return

    print(f"[INFO] Applying {len(revisions)} revisions...")
    apply_revisions(file_line_map, revisions)
    print("[DONE] All headers updated.")
if __name__ == "__main__":
main()