# cs249r_book/interviews/paper/scripts/validate_refs.py

#!/usr/bin/env python3
"""Validate bibliography entries in paper.bbl against CrossRef.

Reads paper.bbl from the paper directory, extracts each \\bibitem's rough
title, queries CrossRef, and reports whether a match was found. Useful as a
spot-check after large bibliography edits to catch typos, wrong years, or
silently-renamed works.

Run: python3 interviews/paper/scripts/validate_refs.py
Reads: ../paper.bbl (generated by LaTeX from references.bib)
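Writes: stdout — one line per citation key: [OK] (CrossRef returned a hit; its
title is printed for manual comparison), [WARN] (no hit), or [ERR] (the lookup
raised an error)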
"""

import json
import re
import time
import urllib.parse
import urllib.request
from pathlib import Path

# Locate paper.bbl relative to this script: <paper dir>/scripts/<this file>.
# Resolve first: __file__ can be a relative path (before Python 3.9 the main
# script's __file__ was whatever path was typed on the command line), and for a
# bare "validate_refs.py" both .parent hops would collapse to ".", pointing
# BBL_FILE into the scripts directory instead of the paper directory.
SCRIPTS_DIR = Path(__file__).resolve().parent
PAPER_DIR = SCRIPTS_DIR.parent
BBL_FILE = PAPER_DIR / "paper.bbl"

# Read the whole .bbl into memory. Decode explicitly as UTF-8 so that
# non-ASCII author names and titles are read the same way on every platform
# instead of depending on the locale's default encoding.
with open(BBL_FILE, "r", encoding="utf-8") as f:
    bbl = f.read()

# Extract bibitems.
# Splitting on a pattern with one capture group yields
#   [preamble, key1, block1, key2, block2, ...]
# so after dropping the preamble the even slots are citation keys and the odd
# slots are the text that follows each \bibitem header.
#
# The pattern is deliberately tolerant of how BibTeX styles lay out the header:
#   * re.DOTALL lets the optional [label] span several lines (natbib wraps
#     long author lists), which a plain "." would fail to match, silently
#     dropping the entry and merging its text into the previous block;
#   * the [label] part is optional, so plain \bibitem{key} entries also count;
#   * "%" and whitespace are allowed between "]" and "{" for styles that break
#     the line there.
_BIBITEM_RE = re.compile(r"\\bibitem(?:\[.*?\]%?\s*)?\{([^}]*)\}", re.DOTALL)

entries = _BIBITEM_RE.split(bbl)[1:]
keys = entries[0::2]
blocks = entries[1::2]

print(f"Checking {len(keys)} cited references...")

# Seconds to wait on a single CrossRef request. Without a timeout, urlopen()
# can block indefinitely on a stalled connection and hang the whole run.
CROSSREF_TIMEOUT_S = 30


def _rough_title(key, block):
    r"""Return a rough, LaTeX-stripped title for one bibitem block.

    Very rough extraction: takes the text after the first ``\newblock`` up to
    the next period, removes LaTeX command names and brace/bracket/math
    punctuation, and strips whitespace. Falls back to the citation key when no
    ``\newblock`` is found or fewer than 5 characters remain.
    """
    m = re.search(r"\\newblock (.*?)\.", block.replace("\n", " "))
    title = m.group(1) if m else key
    title = re.sub(r"\\[a-zA-Z]+", "", title)
    title = re.sub(r"[\{\}\[\]\$\^~]", "", title)
    title = title.strip()
    return title if len(title) >= 5 else key


def _crossref_top_title(title):
    """Query CrossRef by title and return the top hit's title.

    Returns None when CrossRef reports no items, and "" when the top hit has a
    missing or empty title list. Network, HTTP, timeout, and JSON/shape errors
    propagate to the caller.
    """
    url = (
        "https://api.crossref.org/works?query.title="
        + urllib.parse.quote(title)
        + "&select=title,author,URL&rows=1"
    )
    req = urllib.request.Request(
        url, headers={"User-Agent": "mailto:vj@eecs.harvard.edu"}
    )
    with urllib.request.urlopen(req, timeout=CROSSREF_TIMEOUT_S) as response:
        data = json.loads(response.read().decode())
    items = data["message"]["items"]
    if not items:
        return None
    # "title" may be absent or an empty list; "or" covers both so a real hit
    # is not misreported as an error via IndexError.
    return (items[0].get("title") or [""])[0]


# One status line per citation key, printed after all lookups finish.
# NOTE: [OK] only means CrossRef returned a top hit; the returned title is
# included in the line so it can be compared against the reference by eye.
results = []
for key, block in zip(keys, blocks):
    title = _rough_title(key, block)
    try:
        crossref_title = _crossref_top_title(title)
    except Exception as e:
        # Deliberately broad: any failure for one reference (network, HTTP,
        # timeout, malformed JSON) is recorded and the run continues.
        results.append(f"[ERR] {key}: {str(e)} for '{title}'")
    else:
        if crossref_title is not None:
            results.append(f"[OK] {key}: Found '{crossref_title}'")
        else:
            results.append(f"[WARN] {key}: No CrossRef hit for '{title}'")
    # Brief pause between requests to avoid hammering the API.
    time.sleep(0.1)

# Emit the collected status lines, one per line, in citation order.
# Guarded so that an empty result list prints nothing at all.
if results:
    print("\n".join(results))