Generates changelog entries using git history and OpenAI

This commit introduces a script that automatically generates changelog entries by analyzing git commit history and summarizing changes using OpenAI.

The script fetches changes from the `dev` branch since the last gh-pages publish, summarizes the changes using OpenAI, and formats them into a changelog entry. It then prepends the new entry to the existing changelog file.

The script uses git commands to get the commit history and OpenAI to summarize the changes. It also includes logic to sort the changes by chapter order and to categorize them as major or minor updates.

This enables a more automated and insightful changelog generation process.
This commit is contained in:
Vijay Janapa Reddi
2025-06-04 19:27:49 -04:00
parent 5a2bfacb44
commit dd144fb4b7

View File

@@ -1,304 +1,306 @@
import subprocess
import re
import os
import argparse
import yaml
from collections import defaultdict
from datetime import datetime
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
CHANGELOG_FILE = "CHANGELOG.md"
QUARTO_YML_FILE = "_quarto.yml"
GITHUB_REPO_URL = "https://github.com/harvard-edge/cs249r_book/"
MAJOR_CHANGE_THRESHOLD = 200 # Define threshold for major updates
MAJOR_CHANGE_THRESHOLD = 200
def format_friendly_date(date_str):
"""Format the date in a human-friendly way."""
try:
date_obj = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S %z")
return date_obj.strftime("%b %d, %Y")
except ValueError:
return date_str
chapter_order = []
def get_year_from_date(date_str):
"""Extract year from date string."""
try:
date_obj = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S %z")
return date_obj.year
except ValueError:
chapter_lookup = [
("introduction.qmd", "Introduction", 1),
("ml_systems.qmd", "ML Systems", 2),
("dl_primer.qmd", "DL Primer", 3),
("dnn_architectures.qmd", "DNN Architectures", 4),
("workflow.qmd", "AI Workflow", 5),
("data_engineering.qmd", "Data Engineering", 6),
("frameworks.qmd", "AI Frameworks", 7),
("training.qmd", "AI Training", 8),
("efficient_ai.qmd", "Efficient AI", 9),
("optimizations.qmd", "Model Optimizations", 10),
("hw_acceleration.qmd", "AI Acceleration", 11),
("benchmarking.qmd", "Benchmarking AI", 12),
("ops.qmd", "ML Operations", 13),
("ondevice_learning.qmd", "On-Device Learning", 14),
("privacy_security.qmd", "Security & Privacy", 15),
("responsible_ai.qmd", "Responsible AI", 16),
("sustainable_ai.qmd", "Sustainable AI", 17),
("robust_ai.qmd", "Robust AI", 18),
("ai_for_good.qmd", "AI for Good", 19),
("conclusion.qmd", "Conclusion", 20),
]
def load_chapter_order():
global chapter_order
with open(QUARTO_YML_FILE, "r", encoding="utf-8") as f:
data = yaml.safe_load(f)
def find_chapters(obj):
if isinstance(obj, dict):
for key, value in obj.items():
if key == "chapters":
return value
result = find_chapters(value)
if result:
return result
elif isinstance(obj, list):
for item in obj:
result = find_chapters(item)
if result:
return result
return None
def run_git_command(cmd):
"""Run a git command and return the output."""
# print(f"🔄 Running git command: {' '.join(cmd)}")
def extract_qmd_paths(items):
paths = []
for item in items:
if isinstance(item, str) and item.endswith(".qmd"):
paths.append(item)
elif isinstance(item, dict):
if "chapters" in item:
paths.extend(extract_qmd_paths(item["chapters"]))
elif "part" in item and isinstance(item["part"], str):
if item["part"].endswith(".qmd"):
paths.append(item["part"])
if "chapters" in item:
paths.extend(extract_qmd_paths(item["chapters"]))
return paths
chapters_section = find_chapters(data)
chapter_order = extract_qmd_paths(chapters_section) if chapters_section else []
print(f"📚 Loaded {len(chapter_order)} chapters from _quarto.yml")
def run_git_command(cmd, verbose=False):
if verbose:
print(f"📦 Running: {' '.join(cmd)}")
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"❌ Error running command: {' '.join(cmd)}")
print(result.stderr)
raise SystemExit(f"Git command failed: {' '.join(cmd)}")
raise RuntimeError(f"Git command failed: {' '.join(cmd)}\n{result.stderr}")
return result.stdout.strip()
def extract_chapter_title(file_path):
"""Extract the chapter title from the QMD file, handling path changes."""
# Get the base filename without path and extension
base_name = os.path.basename(file_path).replace('.qmd', '')
try:
# Try to find the file in git history by searching for any path containing the filename
cmd = ["git", "ls-tree", "-r", "--name-only", "dev"]
all_files = run_git_command(cmd).split('\n')
# Find any .qmd file that matches our base filename
matching_files = [f for f in all_files if f.endswith('.qmd') and os.path.basename(f).replace('.qmd', '') == base_name]
if matching_files:
# Use the most recent matching file
current_file = matching_files[0]
try:
content = run_git_command(["git", "show", f"dev:{current_file}"])
for line in content.split('\n'):
if line.startswith("#"):
match = re.match(r"^#\s+(.*?)\s*(?:{.*)?$", line.strip())
if match:
return match.group(1).strip()
except SystemExit:
print(f"📝 Note: Couldn't read content of {current_file}")
except SystemExit:
print(f"📝 Note: Couldn't search git history for {base_name}")
# If we couldn't find or read the file, create a title from the filename
readable_title = base_name.replace('_', ' ').title()
# print(f"📝 Note: Using generated title '{readable_title}' for {file_path}")
return readable_title
base = os.path.basename(file_path)
for fname, title, number in chapter_lookup:
if fname == base:
return f"Chapter {number}: {title}"
return base.replace('_', ' ').replace('.qmd', '').title()
def get_changes_in_dev_since(date_start, date_end=None):
"""Get changes in the dev branch since a given date."""
def sort_by_chapter_order(updates):
def extract_path(update):
match = re.search(r'\*\*(.*?)\*\*', update)
if match:
title = match.group(1)
for path in chapter_order:
if title.lower().replace(' ', '_') in path.lower():
return chapter_order.index(path)
return float('inf')
return sorted(updates, key=extract_path)
def get_changes_in_dev_since(date_start, date_end=None, verbose=False):
cmd = ["git", "log", "--numstat", "--since", date_start]
if date_end:
cmd += ["--until", date_end]
# Look for .qmd files anywhere in the contents directory and its subdirectories
cmd += ["origin/dev", "--", "contents/**/*.qmd"]
return run_git_command(cmd)
return run_git_command(cmd, verbose=verbose)
def generate_change_visual(added, removed, max_length=6):
"""Generate a visual representation of changes."""
total = added + removed
if total == 0:
return ""
added_blocks = int((added / total) * max_length) if total > 0 else 0
removed_blocks = int((removed / total) * max_length) if total > 0 else 0
added_bars = f'<span style="color:green">{"+" * added_blocks}</span>'
removed_bars = f'<span style="color:red">{"-" * removed_blocks}</span>'
return f"{added_bars}{removed_bars}"
def get_commit_messages_for_file(file_path, since, until=None, verbose=False):
cmd = ["git", "log", "--pretty=format:%s", "--since", since]
if until:
cmd += ["--until", until]
cmd += ["origin/dev", "--", file_path]
return run_git_command(cmd, verbose=verbose)
def get_latest_gh_pages_commit():
"""Get the latest gh-pages commit hash and date."""
cmd = ["git", "--no-pager", "log", "-1", "--pretty=format:%H %ad", "--date=iso", "origin/gh-pages"]
output = run_git_command(cmd)
parts = output.split(" ", 1)
if len(parts) == 2:
return parts[0], parts[1]
return None, None
def summarize_changes_with_openai(file_path, commit_messages, verbose=False):
chapter_title = extract_chapter_title(file_path)
if verbose:
print(f"🤖 Calling OpenAI for: {file_path} -- {chapter_title}")
def generate_changelog():
"""Generate the changelog content."""
print("🚀 Starting changelog generation...")
prompt = f"""You're helping to generate a changelog for a machine learning systems textbook.
The following file has been updated: {file_path}
intro_text = (
f"_Last Updated: {datetime.now().strftime('%b %d, %Y')}_\n\n"
Here are the commit messages:
{commit_messages}
Summarize the meaningful content-level changes (new sections, rewrites, example additions, figure changes).
Ignore formatting or typo-only changes.
Only return the summary sentence (not the bullet or chapter title)."""
try:
response = client.chat.completions.create(
model="gpt-4",
temperature=0.3,
messages=[
{"role": "system", "content": "You are a helpful assistant writing changelog summaries."},
{"role": "user", "content": prompt}
]
)
summary = response.choices[0].message.content.strip()
if not summary:
return f"- **{chapter_title}**: _(no meaningful changes detected)_"
clean_summary = summary.partition(":")[-1].strip()
if not clean_summary:
clean_summary = summary # fallback if no colon was present
return f"- **{chapter_title}**: {clean_summary}"
except Exception as e:
print(f"⚠️ OpenAI failed for {file_path}: {e}")
return f"- **{chapter_title}**: _(unable to summarize; see commits manually)_"
def format_friendly_date(date_str):
try:
return datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S %z").strftime("%b %d, %Y")
except:
return date_str
def normalized_path(path):
return os.path.normpath(path).lower()
def generate_entry(start_date, end_date=None, verbose=False):
if verbose:
print(f"📁 Processing changes from {start_date} to {end_date or 'now'}")
changes = get_changes_in_dev_since(start_date, end_date, verbose=verbose)
if not changes.strip():
return None
changes_by_file = defaultdict(lambda: [0, 0])
for line in changes.splitlines():
parts = line.split("\t")
if len(parts) != 3:
continue
added, removed, file_path = parts
added = int(added) if added.isdigit() else 0
removed = int(removed) if removed.isdigit() else 0
changes_by_file[file_path][0] += added
changes_by_file[file_path][1] += removed
current_date = datetime.now().strftime('%b %d, %Y') if not end_date else format_friendly_date(end_date)
entry = f"### 📅 Published on {current_date}\n\n"
major, minor = [], []
ordered_files = sorted(
changes_by_file,
key=lambda f: next(
(i for i, ch in enumerate(chapter_order) if normalized_path(f).endswith(normalized_path(ch))),
float('inf')
)
)
# Check if gh-pages branch exists
print("🔍 Checking for gh-pages branch...")
gh_pages_exists = run_git_command(["git", "ls-remote", "--heads", "origin", "gh-pages"])
if not gh_pages_exists:
raise SystemExit("❌ Error: `gh-pages` branch not found on the remote. The changelog generation process requires this branch to exist.")
print("✅ gh-pages branch found")
for file_path in ordered_files:
total = added + removed
if verbose:
print(f"🔍 Summarizing {file_path} ({added}+ / {removed}-)")
commit_msgs = get_commit_messages_for_file(file_path, start_date, end_date, verbose=verbose)
summary = summarize_changes_with_openai(file_path, commit_msgs, verbose=verbose)
if total > MAJOR_CHANGE_THRESHOLD:
major.append(summary)
else:
minor.append(summary)
# Fetch the gh-pages branch
print("📥 Fetching gh-pages branch...")
run_git_command(["git", "fetch", "origin", "gh-pages:refs/remotes/origin/gh-pages"])
# Fetch the dev branch
print("📥 Fetching dev branch...")
run_git_command(["git", "fetch", "origin", "dev:refs/remotes/origin/dev"])
if major:
entry += "<details open>\n<summary>**Major Updates**</summary>\n\n" + "\n".join(sort_by_chapter_order(major)) + "\n\n</details>\n\n"
if minor:
entry += "<details open>\n<summary>**Minor Updates**</summary>\n\n" + "\n".join(sort_by_chapter_order(minor)) + "\n\n</details>\n"
# Get commit history
print("📚 Getting commit history...")
commits_with_dates = run_git_command([
"git", "--no-pager", "log", "--pretty=format:%H %ad", "--date=iso", "origin/gh-pages"
]).split("\n")
return entry
if not commits_with_dates:
return intro_text + "_No `gh-pages` commits found._"
def generate_changelog(mode="incremental", verbose=False):
run_git_command(["git", "fetch", "origin", "gh-pages:refs/remotes/origin/gh-pages"], verbose=verbose)
run_git_command(["git", "fetch", "origin", "dev:refs/remotes/origin/dev"], verbose=verbose)
print(f"📊 Found {len(commits_with_dates)} commits to process")
def get_latest_gh_pages_commit():
output = run_git_command(["git", "log", "-1", "--pretty=format:%H %ad", "--date=iso", "origin/gh-pages"], verbose=verbose)
parts = output.split(" ", 1)
return (parts[0], parts[1]) if len(parts) == 2 else (None, None)
# Parse commits and dates
commits_with_dates = [(line.split(" ")[0], " ".join(line.split(" ")[1:])) for line in commits_with_dates]
# Group changes by year
changes_by_year = defaultdict(list)
current_year = datetime.now().year
first_details_created = False
# Add entry for pending changes (changes in dev since last gh-pages commit)
latest_commit, latest_date = get_latest_gh_pages_commit()
if latest_commit and latest_date:
print("🔍 Checking for changes since last gh-pages commit...")
# Get changes between latest gh-pages commit and current dev
current_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S %z")
pending_changes = get_changes_in_dev_since(latest_date)
if pending_changes.strip():
print("🆕 Found changes to include")
changes_by_file = defaultdict(lambda: [0, 0])
for line in pending_changes.split("\n"):
parts = line.split("\t")
if len(parts) != 3:
continue
intro = f""
sections = []
added, removed, file_path = parts
added = int(added) if added.isdigit() else 0
removed = int(removed) if removed.isdigit() else 0
changes_by_file[file_path][0] += added
changes_by_file[file_path][1] += removed
if mode == "full":
if verbose:
print("🔁 Running full regeneration...")
commits = run_git_command(["git", "log", "--pretty=format:%H %ad", "--date=iso", "origin/gh-pages"], verbose=verbose).splitlines()
commits = [(c.split(" ")[0], " ".join(c.split(" ")[1:])) for c in commits]
for i in range(len(commits) - 1):
entry = generate_entry(commits[i + 1][1], commits[i][1], verbose=verbose)
if entry:
sections.append(entry)
else:
if verbose:
print("⚡ Running incremental update...")
entry = generate_entry(latest_date, verbose=verbose)
if entry:
sections.append(entry)
# Create change entry for current changes (using today's date)
change_entry = f"### 📅 Published on {datetime.now().strftime('%b %d, %Y')}\n\n"
if not sections:
return intro + "_No updates found._"
total_added = sum(added for added, _ in changes_by_file.values())
total_removed = sum(removed for _, removed in changes_by_file.values())
total_files = len(changes_by_file)
change_entry += f"{total_files} files updated "
change_entry += f"({total_added} lines added, {total_removed} lines removed)\n\n"
# Separate Major and Minor Updates
major_updates = []
minor_updates = []
for file_path, (added, removed) in changes_by_file.items():
chapter_title = extract_chapter_title(file_path)
total_changes = added + removed
change_visual = generate_change_visual(added, removed)
if total_changes > MAJOR_CHANGE_THRESHOLD:
major_updates.append(f"- **{chapter_title}**: {change_visual} ({added} lines added, {removed} lines removed)")
else:
minor_updates.append(f"- **{chapter_title}**: {change_visual} ({added} lines added, {removed} lines removed)")
if major_updates:
change_entry += f"<details open>\n"
change_entry += " <summary>**Major Updates**</summary>\n\n"
change_entry += "\n".join(sorted(major_updates)) + "\n\n"
change_entry += "</details>\n\n"
if minor_updates:
change_entry += f"<details open>\n"
change_entry += " <summary>**Minor Updates**</summary>\n\n"
change_entry += "\n".join(sorted(minor_updates)) + "\n\n"
change_entry += "</details>\n"
changes_by_year[current_year].insert(0, change_entry)
print("🔄 Processing historical commits...")
for i in range(len(commits_with_dates) - 1):
current_commit, current_date = commits_with_dates[i]
previous_commit, previous_date = commits_with_dates[i + 1]
# print(f"📝 Processing changes between {format_friendly_date(previous_date)} and {format_friendly_date(current_date)}")
year = get_year_from_date(current_date)
if not year:
continue
changes = get_changes_in_dev_since(previous_date, current_date)
if not changes.strip():
continue
changes_by_file = defaultdict(lambda: [0, 0])
for line in changes.split("\n"):
parts = line.split("\t")
if len(parts) != 3:
continue
added, removed, file_path = parts
added = int(added) if added.isdigit() else 0
removed = int(removed) if removed.isdigit() else 0
changes_by_file[file_path][0] += added
changes_by_file[file_path][1] += removed
# Generate diff link
full_diff_link = f"{GITHUB_REPO_URL}/compare/{previous_commit}...{current_commit}"
# Create change entry
change_entry = f"### 📅 Published on {format_friendly_date(current_date)}\n\n"
change_entry += f"🔗 [View Full Diff]({full_diff_link}) --- "
total_added = sum(added for added, _ in changes_by_file.values())
total_removed = sum(removed for _, removed in changes_by_file.values())
total_files = len(changes_by_file)
change_entry += f"{total_files} files updated "
change_entry += f"({total_added} lines added, {total_removed} lines removed)\n\n"
# Separate Major and Minor Updates
major_updates = []
minor_updates = []
for file_path, (added, removed) in changes_by_file.items():
chapter_title = extract_chapter_title(file_path)
total_changes = added + removed
change_visual = generate_change_visual(added, removed)
if total_changes > MAJOR_CHANGE_THRESHOLD:
major_updates.append(f"- **{chapter_title}**: {change_visual} ({added} lines added, {removed} lines removed)")
else:
minor_updates.append(f"- **{chapter_title}**: {change_visual} ({added} lines added, {removed} lines removed)")
if major_updates:
details_open = " open" if not first_details_created and not i else ""
change_entry += f"<details{details_open}>\n"
change_entry += " <summary>**Major Updates**</summary>\n\n"
change_entry += "\n".join(sorted(major_updates)) + "\n\n"
change_entry += "</details>\n\n"
if minor_updates:
details_open = " open" if not first_details_created and not i and not major_updates else ""
change_entry += f"<details{details_open}>\n"
change_entry += " <summary>**Minor Updates**</summary>\n\n"
change_entry += "\n".join(sorted(minor_updates)) + "\n\n"
change_entry += "</details>\n"
first_details_created = True
changes_by_year[year].append(change_entry)
print("✨ Finalizing changelog...")
summary = intro_text
for year in sorted(changes_by_year.keys(), reverse=True):
print(f"📅 Adding entries for {year}")
year_summary = f"## {year} Changes\n\n"
year_summary += "\n".join(changes_by_year[year])
summary += year_summary
summary += "\n---\n\n"
return summary.strip()
year = datetime.now().year
return "\n\n".join(sections) + "\n"
if __name__ == "__main__":
print("🗑️ Removing old changelog file if it exists...")
if os.path.exists(CHANGELOG_FILE):
try:
os.remove(CHANGELOG_FILE)
print(f"✅ Successfully removed old {CHANGELOG_FILE}")
except Exception as e:
print(f"❌ Error removing old changelog: {str(e)}")
raise SystemExit("Failed to remove old changelog file")
parser = argparse.ArgumentParser(description="Generate changelog for ML systems book.")
parser.add_argument("-i", "--incremental", action="store_true", help="Add new entries since last gh-pages publish (default).")
parser.add_argument("-f", "--full", action="store_true", help="Regenerate the entire changelog from scratch.")
parser.add_argument("-t", "--test", action="store_true", help="Run without writing to file.")
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output.")
args = parser.parse_args()
mode = "incremental"
if args.full:
mode = "full"
print("📝 Generating new changelog...")
changelog = generate_changelog()
try:
with open(CHANGELOG_FILE, "w", encoding="utf-8") as f:
f.write(changelog + "\n")
print(f"✅ Changelog successfully generated in {CHANGELOG_FILE}")
load_chapter_order()
new_entry = generate_changelog(mode=mode, verbose=args.verbose) # returns just the `📅 Published on ...` block
if args.test:
print("🧪 TEST OUTPUT ONLY:\n")
print(new_entry)
else:
existing = ""
if os.path.exists(CHANGELOG_FILE):
with open(CHANGELOG_FILE, "r", encoding="utf-8") as f:
existing = f.read()
current_year = datetime.now().year
year_header = f"## {current_year} Changes"
# Remove first occurrence of the year header
existing_lines = existing.splitlines()
filtered_lines = []
found = False
for line in existing_lines:
if not found and line.strip() == year_header:
found = True
continue # skip this one line only
filtered_lines.append(line)
cleaned_existing = "\n".join(filtered_lines).strip()
# Prepend the new entry with correct year section
updated_content = f"{year_header}\n\n{new_entry.strip()}\n---\n\n{cleaned_existing}"
with open(CHANGELOG_FILE, "w", encoding="utf-8") as f:
f.write(updated_content.strip() + "\n")
print(f"\n✅ Changelog written to {CHANGELOG_FILE}")
except Exception as e:
print(f"❌ Error writing changelog: {str(e)}")
raise SystemExit("Failed to write new changelog file")
print(f"❌ Error: {e}")