Generates changelog entries using git history and OpenAI

This commit introduces a script that automatically generates changelog entries by analyzing git commit history and summarizing the changes with OpenAI. The script uses git commands to collect the changes made on the `dev` branch since the last gh-pages publish, asks OpenAI to summarize the commit messages for each changed chapter file, and formats the summaries into a changelog entry, which it then prepends to the existing changelog file. It also includes logic to sort the changes by chapter order and to categorize them as major or minor updates. This enables a more automated and insightful changelog generation process.
2026-05-04 00:29:10 -05:00 · 2025-06-04 19:27:49 -04:00
parent 5a2bfacb44
commit dd144fb4b7
1 changed file with 264 additions and 262 deletions
--- a/.github/scripts/update_changelog.py
+++ b/.github/scripts/update_changelog.py
@@ -1,304 +1,306 @@
 import subprocess
 import re
 import os
+import argparse
+import yaml
 from collections import defaultdict
 from datetime import datetime
+from openai import OpenAI
+
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

 CHANGELOG_FILE = "CHANGELOG.md"
+QUARTO_YML_FILE = "_quarto.yml"
 GITHUB_REPO_URL = "https://github.com/harvard-edge/cs249r_book/"
-MAJOR_CHANGE_THRESHOLD = 200  # Define threshold for major updates
+MAJOR_CHANGE_THRESHOLD = 200

-def format_friendly_date(date_str):
-    """Format the date in a human-friendly way."""
-    try:
-        date_obj = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S %z")
-        return date_obj.strftime("%b %d, %Y")
-    except ValueError:
-        return date_str
+# Ordered list of .qmd paths; starts empty and is filled in by load_chapter_order().
+chapter_order = []

-def get_year_from_date(date_str):
-    """Extract year from date string."""
-    try:
-        date_obj = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S %z")
-        return date_obj.year
-    except ValueError:
+# Static chapter table: (source filename, display title, chapter number).
+# extract_chapter_title() matches on the bare filename and builds the
+# "Chapter N: Title" label from the other two fields.
+chapter_lookup = [
+    ("introduction.qmd", "Introduction", 1),
+    ("ml_systems.qmd", "ML Systems", 2),
+    ("dl_primer.qmd", "DL Primer", 3),
+    ("dnn_architectures.qmd", "DNN Architectures", 4),
+    ("workflow.qmd", "AI Workflow", 5),
+    ("data_engineering.qmd", "Data Engineering", 6),
+    ("frameworks.qmd", "AI Frameworks", 7),
+    ("training.qmd", "AI Training", 8),
+    ("efficient_ai.qmd", "Efficient AI", 9),
+    ("optimizations.qmd", "Model Optimizations", 10),
+    ("hw_acceleration.qmd", "AI Acceleration", 11),
+    ("benchmarking.qmd", "Benchmarking AI", 12),
+    ("ops.qmd", "ML Operations", 13),
+    ("ondevice_learning.qmd", "On-Device Learning", 14),
+    ("privacy_security.qmd", "Security & Privacy", 15),
+    ("responsible_ai.qmd", "Responsible AI", 16),
+    ("sustainable_ai.qmd", "Sustainable AI", 17),
+    ("robust_ai.qmd", "Robust AI", 18),
+    ("ai_for_good.qmd", "AI for Good", 19),
+    ("conclusion.qmd", "Conclusion", 20),
+]
+
+def load_chapter_order():
+    """Populate the global ``chapter_order`` from the Quarto config.
+
+    Reads QUARTO_YML_FILE, locates the first ``chapters`` key found anywhere
+    in the parsed document, and flattens it into an ordered list of ``.qmd``
+    paths. ``chapter_order`` is set to an empty list when no non-empty
+    ``chapters`` section is found. Prints the number of paths loaded.
+    """
+    global chapter_order
+    with open(QUARTO_YML_FILE, "r", encoding="utf-8") as f:
+        data = yaml.safe_load(f)
+
+    def find_chapters(obj):
+        # Depth-first search; returns the value of the first "chapters" key met.
+        if isinstance(obj, dict):
+            for key, value in obj.items():
+                if key == "chapters":
+                    return value
+                result = find_chapters(value)
+                if result:
+                    return result
+        elif isinstance(obj, list):
+            for item in obj:
+                result = find_chapters(item)
+                if result:
+                    return result
         return None

-def run_git_command(cmd):
-    """Run a git command and return the output."""
-    # print(f"🔄 Running git command: {' '.join(cmd)}")
+    def extract_qmd_paths(items):
+        # Flatten a chapters list into .qmd paths, preserving document order.
+        paths = []
+        for item in items:
+            if isinstance(item, str) and item.endswith(".qmd"):
+                paths.append(item)
+            elif isinstance(item, dict):
+                # An entry may name its own .qmd page under "part" *and* carry
+                # nested chapters. Record the part page first, then descend.
+                # (Previously the nested-chapters check sat in an elif branch
+                # that could never run when "chapters" was present, so the
+                # part page was silently dropped.)
+                part = item.get("part")
+                if isinstance(part, str) and part.endswith(".qmd"):
+                    paths.append(part)
+                if "chapters" in item:
+                    # `or []` tolerates an empty "chapters:" key (parsed as None).
+                    paths.extend(extract_qmd_paths(item["chapters"] or []))
+        return paths
+
+    chapters_section = find_chapters(data)
+    chapter_order = extract_qmd_paths(chapters_section) if chapters_section else []
+
+    print(f"📚 Loaded {len(chapter_order)} chapters from _quarto.yml")
+
+def run_git_command(cmd, verbose=False):
+    """Run a command and return its stripped standard output.
+
+    Args:
+        cmd: Argument list handed to ``subprocess.run``.
+        verbose: When true, print the command before running it.
+
+    Raises:
+        RuntimeError: If the process exits with a non-zero status; the
+            message contains the command and its stderr.
+    """
+    if verbose:
+        print(f"📦 Running: {' '.join(cmd)}")
     result = subprocess.run(cmd, capture_output=True, text=True)
     if result.returncode != 0:
-        print(f"❌ Error running command: {' '.join(cmd)}")
-        print(result.stderr)
-        raise SystemExit(f"Git command failed: {' '.join(cmd)}")
+        raise RuntimeError(f"Git command failed: {' '.join(cmd)}\n{result.stderr}")
     return result.stdout.strip()

 def extract_chapter_title(file_path):
-    """Extract the chapter title from the QMD file, handling path changes."""
-    # Get the base filename without path and extension
-    base_name = os.path.basename(file_path).replace('.qmd', '')
-    
-    try:
-        # Try to find the file in git history by searching for any path containing the filename
-        cmd = ["git", "ls-tree", "-r", "--name-only", "dev"]
-        all_files = run_git_command(cmd).split('\n')
-        
-        # Find any .qmd file that matches our base filename
-        matching_files = [f for f in all_files if f.endswith('.qmd') and os.path.basename(f).replace('.qmd', '') == base_name]
-        
-        if matching_files:
-            # Use the most recent matching file
-            current_file = matching_files[0]
-            try:
-                content = run_git_command(["git", "show", f"dev:{current_file}"])
-                for line in content.split('\n'):
-                    if line.startswith("#"):
-                        match = re.match(r"^#\s+(.*?)\s*(?:{.*)?$", line.strip())
-                        if match:
-                            return match.group(1).strip()
-            except SystemExit:
-                print(f"📝 Note: Couldn't read content of {current_file}")
-    except SystemExit:
-        print(f"📝 Note: Couldn't search git history for {base_name}")
-    
-    # If we couldn't find or read the file, create a title from the filename
-    readable_title = base_name.replace('_', ' ').title()
-    # print(f"📝 Note: Using generated title '{readable_title}' for {file_path}")
-    return readable_title
+    """Return the changelog label for a chapter file.
+
+    Files listed in ``chapter_lookup`` (matched on bare filename) yield
+    "Chapter N: Title"; any other file yields a title derived from its
+    filename.
+    """
+    filename = os.path.basename(file_path)
+    label = next(
+        (f"Chapter {num}: {name}" for known, name, num in chapter_lookup if known == filename),
+        None,
+    )
+    if label is not None:
+        return label
+    # Unknown file: underscores become spaces, the extension is dropped,
+    # and the result is title-cased.
+    return filename.replace('_', ' ').replace('.qmd', '').title()

-def get_changes_in_dev_since(date_start, date_end=None):
-    """Get changes in the dev branch since a given date."""
+def sort_by_chapter_order(updates):
+    """Return *updates* sorted by chapter position in ``chapter_order``.
+
+    The first ``**bold**`` span of each update is taken as its title,
+    slugified (lower-cased, spaces replaced by underscores) and looked up as
+    a substring of each chapter path. Updates without a match sort last.
+
+    NOTE(review): labels built by extract_chapter_title() for known chapters
+    look like "Chapter 3: DL Primer"; the resulting slug is unlikely to occur
+    in any path, so those updates presumably always take the "sort last"
+    key -- confirm this is intended.
+    """
+    def position_of(update):
+        bold = re.search(r'\*\*(.*?)\*\*', update)
+        if bold is None:
+            return float('inf')
+        slug = bold.group(1).lower().replace(' ', '_')
+        for idx, path in enumerate(chapter_order):
+            if slug in path.lower():
+                return idx
+        return float('inf')
+    return sorted(updates, key=position_of)
+
+def get_changes_in_dev_since(date_start, date_end=None, verbose=False):
+    """Return ``git log --numstat`` output for .qmd files on origin/dev.
+
+    Args:
+        date_start: Value passed to ``--since``.
+        date_end: Optional value passed to ``--until``; omitted when falsy.
+        verbose: Forwarded to run_git_command().
+    """
     cmd = ["git", "log", "--numstat", "--since", date_start]
     if date_end:
         cmd += ["--until", date_end]
-    # Look for .qmd files anywhere in the contents directory and its subdirectories
     cmd += ["origin/dev", "--", "contents/**/*.qmd"]
-    return run_git_command(cmd)
+    return run_git_command(cmd, verbose=verbose)

-def generate_change_visual(added, removed, max_length=6):
-    """Generate a visual representation of changes."""
-    total = added + removed
-    if total == 0:
-        return ""
-    added_blocks = int((added / total) * max_length) if total > 0 else 0
-    removed_blocks = int((removed / total) * max_length) if total > 0 else 0
-    added_bars = f'<span style="color:green">{"+" * added_blocks}</span>'
-    removed_bars = f'<span style="color:red">{"-" * removed_blocks}</span>'
-    return f"{added_bars}{removed_bars}"
+def get_commit_messages_for_file(file_path, since, until=None, verbose=False):
+    """Return the commit subject lines on origin/dev that touch *file_path*.
+
+    The window is bounded by ``--since since`` and, when *until* is truthy,
+    ``--until until``. Output is whatever run_git_command() returns for the
+    ``git log --pretty=format:%s`` call.
+    """
+    date_window = ["--since", since]
+    if until:
+        date_window.extend(["--until", until])
+    git_args = ["git", "log", "--pretty=format:%s", *date_window, "origin/dev", "--", file_path]
+    return run_git_command(git_args, verbose=verbose)

-def get_latest_gh_pages_commit():
-    """Get the latest gh-pages commit hash and date."""
-    cmd = ["git", "--no-pager", "log", "-1", "--pretty=format:%H %ad", "--date=iso", "origin/gh-pages"]
-    output = run_git_command(cmd)
-    parts = output.split(" ", 1)
-    if len(parts) == 2:
-        return parts[0], parts[1]
-    return None, None
+def summarize_changes_with_openai(file_path, commit_messages, verbose=False):
+    """Build one changelog bullet for *file_path* from its commit messages.
+
+    Sends the commit messages to the OpenAI chat completions API and returns
+    a Markdown bullet of the form ``- **<chapter title>**: <summary>``.
+    Any exception raised while calling the API or reading the reply is
+    caught and reported, and a placeholder bullet is returned instead.
+
+    Args:
+        file_path: Path of the changed file; also used to derive the title.
+        commit_messages: Text inserted verbatim into the prompt.
+        verbose: When true, print which file is being summarized.
+    """
+    chapter_title = extract_chapter_title(file_path)
+    if verbose:
+        print(f"🤖 Calling OpenAI for: {file_path} -- {chapter_title}")

-def generate_changelog():
-    """Generate the changelog content."""
-    print("🚀 Starting changelog generation...")
+    prompt = f"""You're helping to generate a changelog for a machine learning systems textbook.
+The following file has been updated: {file_path}

-    intro_text = (
-        f"_Last Updated: {datetime.now().strftime('%b %d, %Y')}_\n\n"
+Here are the commit messages:
+{commit_messages}
+
+Summarize the meaningful content-level changes (new sections, rewrites, example additions, figure changes).
+Ignore formatting or typo-only changes.
+
+Only return the summary sentence (not the bullet or chapter title)."""
+
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4",
+            temperature=0.3,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant writing changelog summaries."},
+                {"role": "user", "content": prompt}
+            ]
+        )
+
+        summary = response.choices[0].message.content.strip()
+
+        if not summary:
+            return f"- **{chapter_title}**: _(no meaningful changes detected)_"
+
+        # Drop a leading echo of the chapter title, if the reply starts with one.
+        # Splitting on the first ":" anywhere (the previous approach) also cut
+        # real content: "Added two sections: X and Y" became "X and Y".
+        clean_summary = summary
+        for echoed in (f"**{chapter_title}**", chapter_title):
+            if summary[:len(echoed)].lower() == echoed.lower():
+                remainder = summary[len(echoed):].lstrip(" :-").strip()
+                if remainder:
+                    clean_summary = remainder
+                break
+
+        return f"- **{chapter_title}**: {clean_summary}"
+
+    except Exception as e:
+        # Best-effort: one failed summary must not abort the whole changelog.
+        print(f"⚠️ OpenAI failed for {file_path}: {e}")
+        return f"- **{chapter_title}**: _(unable to summarize; see commits manually)_"
+
+
+def format_friendly_date(date_str):
+    """Format a ``YYYY-MM-DD HH:MM:SS +ZZZZ`` timestamp as e.g. ``Jun 04, 2025``.
+
+    Input that cannot be parsed (including non-string values such as None)
+    is returned unchanged.
+    """
+    try:
+        parsed = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S %z")
+    except (ValueError, TypeError):
+        # Only parse failures fall back to the raw value; a bare `except:`
+        # would also swallow KeyboardInterrupt and SystemExit.
+        return date_str
+    return parsed.strftime("%b %d, %Y")
+
+def normalized_path(path):
+    """Return *path* collapsed with ``os.path.normpath`` and lower-cased."""
+    collapsed = os.path.normpath(path)
+    return collapsed.lower()
+
+def generate_entry(start_date, end_date=None, verbose=False):
+    """Build one "Published on ..." changelog section for a date window.
+
+    Collects per-file added/removed line counts from the numstat output of
+    get_changes_in_dev_since(), requests a summary bullet for each changed
+    file, and groups the bullets into Major and Minor sections according to
+    MAJOR_CHANGE_THRESHOLD.
+
+    Args:
+        start_date: Start of the window (passed to git ``--since``).
+        end_date: Optional end of the window; when omitted the entry is
+            dated with today's date.
+        verbose: When true, print progress information.
+
+    Returns:
+        The Markdown entry, or None when git reports no changes.
+    """
+    if verbose:
+        print(f"📁 Processing changes from {start_date} to {end_date or 'now'}")
+    changes = get_changes_in_dev_since(start_date, end_date, verbose=verbose)
+    if not changes.strip():
+        return None
+
+    # numstat rows are "<added>\t<removed>\t<path>"; any line that does not
+    # split into exactly three tab-separated fields is skipped.
+    changes_by_file = defaultdict(lambda: [0, 0])
+    for line in changes.splitlines():
+        parts = line.split("\t")
+        if len(parts) != 3:
+            continue
+        added, removed, file_path = parts
+        # Non-numeric counts are treated as zero.
+        added = int(added) if added.isdigit() else 0
+        removed = int(removed) if removed.isdigit() else 0
+        changes_by_file[file_path][0] += added
+        changes_by_file[file_path][1] += removed
+
+    current_date = datetime.now().strftime('%b %d, %Y') if not end_date else format_friendly_date(end_date)
+    entry = f"### 📅 Published on {current_date}\n\n"
+
+    major, minor = [], []
+
+    # Order files by the position of the matching chapter path; files that
+    # match no chapter go last.
+    ordered_files = sorted(
+        changes_by_file,
+        key=lambda f: next(
+            (i for i, ch in enumerate(chapter_order) if normalized_path(f).endswith(normalized_path(ch))),
+            float('inf')
+        )
     )

-    # Check if gh-pages branch exists
-    print("🔍 Checking for gh-pages branch...")
-    gh_pages_exists = run_git_command(["git", "ls-remote", "--heads", "origin", "gh-pages"])
-    if not gh_pages_exists:
-        raise SystemExit("❌ Error: `gh-pages` branch not found on the remote. The changelog generation process requires this branch to exist.")
-    
-    print("✅ gh-pages branch found")
+    for file_path in ordered_files:
+        # Use this file's own accumulated totals. Without this lookup,
+        # `added`/`removed` still hold the counts of the last numstat row
+        # parsed above, so every file would be classified identically.
+        added, removed = changes_by_file[file_path]
+        total = added + removed
+        if verbose:
+            print(f"🔍 Summarizing {file_path} ({added}+ / {removed}-)")
+        commit_msgs = get_commit_messages_for_file(file_path, start_date, end_date, verbose=verbose)
+        summary = summarize_changes_with_openai(file_path, commit_msgs, verbose=verbose)
+        if total > MAJOR_CHANGE_THRESHOLD:
+            major.append(summary)
+        else:
+            minor.append(summary)

-    # Fetch the gh-pages branch
-    print("📥 Fetching gh-pages branch...")
-    run_git_command(["git", "fetch", "origin", "gh-pages:refs/remotes/origin/gh-pages"])
-    
-    # Fetch the dev branch
-    print("📥 Fetching dev branch...")
-    run_git_command(["git", "fetch", "origin", "dev:refs/remotes/origin/dev"])
+    if major:
+        entry += "<details open>\n<summary>**Major Updates**</summary>\n\n" + "\n".join(sort_by_chapter_order(major)) + "\n\n</details>\n\n"
+    if minor:
+        entry += "<details open>\n<summary>**Minor Updates**</summary>\n\n" + "\n".join(sort_by_chapter_order(minor)) + "\n\n</details>\n"

-    # Get commit history
-    print("📚 Getting commit history...")
-    commits_with_dates = run_git_command([
-        "git", "--no-pager", "log", "--pretty=format:%H %ad", "--date=iso", "origin/gh-pages"
-    ]).split("\n")
+    return entry

-    if not commits_with_dates:
-        return intro_text + "_No `gh-pages` commits found._"
+def generate_changelog(mode="incremental", verbose=False):
+    run_git_command(["git", "fetch", "origin", "gh-pages:refs/remotes/origin/gh-pages"], verbose=verbose)
+    run_git_command(["git", "fetch", "origin", "dev:refs/remotes/origin/dev"], verbose=verbose)

-    print(f"📊 Found {len(commits_with_dates)} commits to process")
+    def get_latest_gh_pages_commit():
+        output = run_git_command(["git", "log", "-1", "--pretty=format:%H %ad", "--date=iso", "origin/gh-pages"], verbose=verbose)
+        parts = output.split(" ", 1)
+        return (parts[0], parts[1]) if len(parts) == 2 else (None, None)

-    # Parse commits and dates
-    commits_with_dates = [(line.split(" ")[0], " ".join(line.split(" ")[1:])) for line in commits_with_dates]
-    
-    # Group changes by year
-    changes_by_year = defaultdict(list)
-    current_year = datetime.now().year
-    first_details_created = False
-
-    # Add entry for pending changes (changes in dev since last gh-pages commit)
    latest_commit, latest_date = get_latest_gh_pages_commit()
-    if latest_commit and latest_date:
-        print("🔍 Checking for changes since last gh-pages commit...")
-        
-        # Get changes between latest gh-pages commit and current dev
-        current_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S %z")
-        pending_changes = get_changes_in_dev_since(latest_date)
-        
-        if pending_changes.strip():
-            print("🆕 Found changes to include")
-            changes_by_file = defaultdict(lambda: [0, 0])
-            
-            for line in pending_changes.split("\n"):
-                parts = line.split("\t")
-                if len(parts) != 3:
-                    continue
+    intro = f""
+    sections = []

-                added, removed, file_path = parts
-                added = int(added) if added.isdigit() else 0
-                removed = int(removed) if removed.isdigit() else 0
-                changes_by_file[file_path][0] += added
-                changes_by_file[file_path][1] += removed
+    if mode == "full":
+        if verbose:
+            print("🔁 Running full regeneration...")
+        log_lines = run_git_command(["git", "log", "--pretty=format:%H %ad", "--date=iso", "origin/gh-pages"], verbose=verbose).splitlines()
+        # Each line is "<hash> <date>"; split it at the first space only.
+        commits = [(c.split(" ")[0], " ".join(c.split(" ")[1:])) for c in log_lines]
+        # One entry per pair of consecutive log lines: the window starts at
+        # the date of the line listed second and ends at the date of the
+        # line listed first.
+        for (_, window_end), (_, window_start) in zip(commits, commits[1:]):
+            entry = generate_entry(window_start, window_end, verbose=verbose)
+            if entry:
+                sections.append(entry)
+    else:
+        if verbose:
+            print("⚡ Running incremental update...")
+        if not latest_date:
+            # get_latest_gh_pages_commit() yields (None, None) when the log
+            # line cannot be split; fail with a clear message instead of
+            # passing None into the git argument list.
+            raise RuntimeError("Could not determine the latest gh-pages publish date.")
+        entry = generate_entry(latest_date, verbose=verbose)
+        if entry:
+            sections.append(entry)

-            # Create change entry for current changes (using today's date)
-            change_entry = f"### 📅 Published on {datetime.now().strftime('%b %d, %Y')}\n\n"
+    if not sections:
+        return intro + "_No updates found._"

-            total_added = sum(added for added, _ in changes_by_file.values())
-            total_removed = sum(removed for _, removed in changes_by_file.values())
-            total_files = len(changes_by_file)
-
-            change_entry += f"{total_files} files updated "
-            change_entry += f"({total_added} lines added, {total_removed} lines removed)\n\n"
-
-            # Separate Major and Minor Updates
-            major_updates = []
-            minor_updates = []
-
-            for file_path, (added, removed) in changes_by_file.items():
-                chapter_title = extract_chapter_title(file_path)
-                total_changes = added + removed
-                change_visual = generate_change_visual(added, removed)
-                if total_changes > MAJOR_CHANGE_THRESHOLD:
-                    major_updates.append(f"- **{chapter_title}**: {change_visual} ({added} lines added, {removed} lines removed)")
-                else:
-                    minor_updates.append(f"- **{chapter_title}**: {change_visual} ({added} lines added, {removed} lines removed)")
-
-            if major_updates:
-                change_entry += f"<details open>\n"
-                change_entry += "  <summary>**Major Updates**</summary>\n\n"
-                change_entry += "\n".join(sorted(major_updates)) + "\n\n"
-                change_entry += "</details>\n\n"
-
-            if minor_updates:
-                change_entry += f"<details open>\n"
-                change_entry += "  <summary>**Minor Updates**</summary>\n\n"
-                change_entry += "\n".join(sorted(minor_updates)) + "\n\n"
-                change_entry += "</details>\n"
-
-            changes_by_year[current_year].insert(0, change_entry)
-
-    print("🔄 Processing historical commits...")
-    for i in range(len(commits_with_dates) - 1):
-        current_commit, current_date = commits_with_dates[i]
-        previous_commit, previous_date = commits_with_dates[i + 1]
-        
-        # print(f"📝 Processing changes between {format_friendly_date(previous_date)} and {format_friendly_date(current_date)}")
-        
-        year = get_year_from_date(current_date)
-        if not year:
-            continue
-
-        changes = get_changes_in_dev_since(previous_date, current_date)
-        if not changes.strip():
-            continue
-
-        changes_by_file = defaultdict(lambda: [0, 0])
-        for line in changes.split("\n"):
-            parts = line.split("\t")
-            if len(parts) != 3:
-                continue
-
-            added, removed, file_path = parts
-            added = int(added) if added.isdigit() else 0
-            removed = int(removed) if removed.isdigit() else 0
-            changes_by_file[file_path][0] += added
-            changes_by_file[file_path][1] += removed
-
-        # Generate diff link
-        full_diff_link = f"{GITHUB_REPO_URL}/compare/{previous_commit}...{current_commit}"
-        
-        # Create change entry
-        change_entry = f"### 📅 Published on {format_friendly_date(current_date)}\n\n"
-        change_entry += f"🔗 [View Full Diff]({full_diff_link}) --- "
-
-        total_added = sum(added for added, _ in changes_by_file.values())
-        total_removed = sum(removed for _, removed in changes_by_file.values())
-        total_files = len(changes_by_file)
-
-        change_entry += f"{total_files} files updated "
-        change_entry += f"({total_added} lines added, {total_removed} lines removed)\n\n"
-
-        # Separate Major and Minor Updates
-        major_updates = []
-        minor_updates = []
-
-        for file_path, (added, removed) in changes_by_file.items():
-            chapter_title = extract_chapter_title(file_path)
-            total_changes = added + removed
-            change_visual = generate_change_visual(added, removed)
-            if total_changes > MAJOR_CHANGE_THRESHOLD:
-                major_updates.append(f"- **{chapter_title}**: {change_visual} ({added} lines added, {removed} lines removed)")
-            else:
-                minor_updates.append(f"- **{chapter_title}**: {change_visual} ({added} lines added, {removed} lines removed)")
-
-        if major_updates:
-            details_open = " open" if not first_details_created and not i else ""
-            change_entry += f"<details{details_open}>\n"
-            change_entry += "  <summary>**Major Updates**</summary>\n\n"
-            change_entry += "\n".join(sorted(major_updates)) + "\n\n"
-            change_entry += "</details>\n\n"
-
-        if minor_updates:
-            details_open = " open" if not first_details_created and not i and not major_updates else ""
-            change_entry += f"<details{details_open}>\n"
-            change_entry += "  <summary>**Minor Updates**</summary>\n\n"
-            change_entry += "\n".join(sorted(minor_updates)) + "\n\n"
-            change_entry += "</details>\n"
-
-        first_details_created = True
-        changes_by_year[year].append(change_entry)
-        
-    print("✨ Finalizing changelog...")
-    summary = intro_text
-    for year in sorted(changes_by_year.keys(), reverse=True):
-        print(f"📅 Adding entries for {year}")
-        year_summary = f"## {year} Changes\n\n"
-        year_summary += "\n".join(changes_by_year[year])        
-        summary += year_summary
-        summary += "\n---\n\n"
-
-    return summary.strip()
+    year = datetime.now().year
+    return "\n\n".join(sections) + "\n"

 if __name__ == "__main__":
-    print("🗑️ Removing old changelog file if it exists...")
-    if os.path.exists(CHANGELOG_FILE):
-        try:
-            os.remove(CHANGELOG_FILE)
-            print(f"✅ Successfully removed old {CHANGELOG_FILE}")
-        except Exception as e:
-            print(f"❌ Error removing old changelog: {str(e)}")
-            raise SystemExit("Failed to remove old changelog file")
+    # Command-line entry point: parse flags, generate the new entry, then
+    # either print it (--test) or prepend it to CHANGELOG_FILE.
+    parser = argparse.ArgumentParser(description="Generate changelog for ML systems book.")
+    parser.add_argument("-i", "--incremental", action="store_true", help="Add new entries since last gh-pages publish (default).")
+    parser.add_argument("-f", "--full", action="store_true", help="Regenerate the entire changelog from scratch.")
+    parser.add_argument("-t", "--test", action="store_true", help="Run without writing to file.")
+    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output.")
+
+    args = parser.parse_args()
+    # --incremental is the default; only --full changes the mode.
+    mode = "full" if args.full else "incremental"

-    print("📝 Generating new changelog...")
-    changelog = generate_changelog()
-    
     try:
-        with open(CHANGELOG_FILE, "w", encoding="utf-8") as f:
-            f.write(changelog + "\n")
-        print(f"✅ Changelog successfully generated in {CHANGELOG_FILE}")
+        load_chapter_order()
+        new_entry = generate_changelog(mode=mode, verbose=args.verbose)  # returns just the `📅 Published on ...` block
+
+        if args.test:
+            print("🧪 TEST OUTPUT ONLY:\n")
+            print(new_entry)
+        else:
+            # NOTE(review): the new text is prepended to the existing file in
+            # both modes, including --full, and the "_No updates found._"
+            # placeholder is written like a regular entry -- confirm both
+            # behaviours are intended.
+            existing = ""
+            if os.path.exists(CHANGELOG_FILE):
+                with open(CHANGELOG_FILE, "r", encoding="utf-8") as f:
+                    existing = f.read()
+
+            current_year = datetime.now().year
+            year_header = f"## {current_year} Changes"
+
+            # Remove first occurrence of the year header so it is not
+            # duplicated when the header is re-added above the new entry.
+            existing_lines = existing.splitlines()
+            filtered_lines = []
+            found = False
+            for line in existing_lines:
+                if not found and line.strip() == year_header:
+                    found = True
+                    continue  # skip this one line only
+                filtered_lines.append(line)
+
+            cleaned_existing = "\n".join(filtered_lines).strip()
+
+            # Prepend the new entry with correct year section
+            updated_content = f"{year_header}\n\n{new_entry.strip()}\n---\n\n{cleaned_existing}"
+
+            with open(CHANGELOG_FILE, "w", encoding="utf-8") as f:
+                f.write(updated_content.strip() + "\n")
+
+            print(f"\n✅ Changelog written to {CHANGELOG_FILE}")
     except Exception as e:
-        print(f"❌ Error writing changelog: {str(e)}")
-        raise SystemExit("Failed to write new changelog file")
+        print(f"❌ Error: {e}")
+        # Exit non-zero so automation running this script sees the failure;
+        # printing alone would leave the exit status at 0.
+        raise SystemExit(1) from e
+