Files
cs249r_book/interviews/paper/scripts/validate_refs.py
Vijay Janapa Reddi 9b313c17d9 feat(paper/scripts): add validate_refs.py — CrossRef spot-check for paper.bbl
Small bbl-validation helper for the interviews paper bibliography. Reads
paper.bbl, extracts each bibitem's rough title, queries CrossRef, and
prints [OK] / [WARN] / [ERR] per citation key. Useful as a spot-check
after large bibliography edits to catch typos, wrong years, or silently-
renamed works.

Placed alongside the other paper-tooling (analyze_corpus.py,
generate_figures.py, generate_macros.py). Path resolution uses
Path(__file__).parent so it works from any CWD.
2026-04-24 11:26:07 -04:00

70 lines
2.2 KiB
Python

#!/usr/bin/env python3
"""Validate bibliography entries in paper.bbl against CrossRef.
Reads paper.bbl from the paper directory, extracts each \\bibitem's rough
title, queries CrossRef, and reports whether a match was found. Useful as a
spot-check after large bibliography edits to catch typos, wrong years, or
silently-renamed works.
Run: python3 interviews/paper/scripts/validate_refs.py
Reads: ../paper.bbl (generated by LaTeX from references.bib)
Writes: stdout — [OK] / [WARN] / [ERR] per citation key
"""
import json
import re
import time
import urllib.parse
import urllib.request
from pathlib import Path
SCRIPTS_DIR = Path(__file__).parent
PAPER_DIR = SCRIPTS_DIR.parent
BBL_FILE = PAPER_DIR / "paper.bbl"

CROSSREF_WORKS_URL = "https://api.crossref.org/works"
# Sent as the User-Agent header on every CrossRef request.
USER_AGENT = "mailto:vj@eecs.harvard.edu"
# Seconds to wait on a single HTTP request before reporting [ERR]; without a
# timeout one stalled connection would hang the whole run.
REQUEST_TIMEOUT_S = 30
# Pause between consecutive requests, in seconds.
REQUEST_DELAY_S = 0.1
# Extracted titles shorter than this are considered junk; the key is used instead.
MIN_TITLE_LEN = 5

# Matches "\bibitem[label]{key}" and plain "\bibitem{key}", capturing the key.
# DOTALL is required because the optional [label] is frequently wrapped across
# several lines in a .bbl file; without it those entries are silently skipped
# and their text is merged into the preceding entry.
BIBITEM_RE = re.compile(r"\\bibitem\s*(?:\[.*?\])?\s*\{([^}]*)\}", re.DOTALL)
# Rough title heuristic: text after the first \newblock up to the first period.
NEWBLOCK_RE = re.compile(r"\\newblock\s+(.*?)\.")
LATEX_COMMAND_RE = re.compile(r"\\[a-zA-Z]+")
LATEX_PUNCT_RE = re.compile(r"[\{\}\[\]\$\^]")


def parse_bibitems(bbl: str) -> list:
    """Split the contents of a .bbl file into ``(key, block)`` pairs.

    Args:
        bbl: Full text of the .bbl file.

    Returns:
        A list of ``(citation_key, entry_text)`` tuples in file order, where
        ``entry_text`` is everything between this ``\\bibitem`` header and the
        next one (or the end of the file). Empty if no ``\\bibitem`` is found.
    """
    # re.split with one capturing group yields
    # [preamble, key1, block1, key2, block2, ...]; drop the preamble.
    parts = BIBITEM_RE.split(bbl)[1:]
    return [(key.strip(), block) for key, block in zip(parts[0::2], parts[1::2])]


def extract_title(key: str, block: str) -> str:
    """Return a rough, LaTeX-free title for one bibliography entry.

    The title is taken to be the text after the first ``\\newblock`` up to the
    first period, so titles containing a period are truncated there.

    Args:
        key: Citation key, used as the fallback query text.
        block: Entry text that follows the ``\\bibitem`` header.

    Returns:
        The cleaned title, or ``key`` when no ``\\newblock`` is present or the
        cleaned title is shorter than ``MIN_TITLE_LEN`` characters.
    """
    match = NEWBLOCK_RE.search(block.replace("\n", " "))
    title = match.group(1) if match else key
    title = LATEX_COMMAND_RE.sub("", title)
    title = LATEX_PUNCT_RE.sub("", title)
    # "~" is LaTeX's non-breaking space: turn it into a real space instead of
    # deleting it, otherwise adjacent words are glued together in the query.
    title = title.replace("~", " ")
    # Collapse runs of whitespace left over from line wrapping and stripping.
    title = " ".join(title.split())
    if len(title) < MIN_TITLE_LEN:
        return key
    return title


def query_crossref(title: str):
    """Ask CrossRef for the single best title match.

    Args:
        title: Free-text title to search for.

    Returns:
        The title of the top CrossRef hit (``""`` if that hit carries no
        title), or ``None`` when CrossRef returns no items.

    Raises:
        Exception: Network, HTTP, timeout, or JSON/shape errors propagate to
            the caller, which reports them as ``[ERR]``.
    """
    url = (
        CROSSREF_WORKS_URL
        + "?query.title="
        + urllib.parse.quote(title)
        + "&select=title,author,URL&rows=1"
    )
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT_S) as response:
        data = json.loads(response.read().decode("utf-8"))
    items = data["message"]["items"]
    if not items:
        return None
    # "title" may be missing or an empty list; neither is an error.
    titles = items[0].get("title") or [""]
    return titles[0]


def check_entry(key: str, block: str) -> str:
    """Validate one bibliography entry and return its report line.

    Note that ``[OK]`` only means CrossRef returned a hit for the query; the
    found title is printed so it can be compared by eye against the entry.

    Args:
        key: Citation key of the entry.
        block: Entry text that follows the ``\\bibitem`` header.

    Returns:
        A line starting with ``[OK]``, ``[WARN]`` (no hit) or ``[ERR]``
        (the lookup itself failed).
    """
    title = extract_title(key, block)
    try:
        found = query_crossref(title)
    except Exception as exc:
        # Deliberately broad: one failed lookup must not abort the whole run.
        return f"[ERR] {key}: {exc} for '{title}'"
    if found is None:
        return f"[WARN] {key}: No CrossRef hit for '{title}'"
    return f"[OK] {key}: Found '{found}'"


def main() -> None:
    """Check every entry of ``BBL_FILE`` against CrossRef and print a report."""
    if not BBL_FILE.is_file():
        raise SystemExit(
            f"{BBL_FILE} not found; run LaTeX/BibTeX on the paper first."
        )
    # Explicit encoding so the result does not depend on the platform locale;
    # undecodable bytes are replaced because the titles only feed a fuzzy search.
    bbl = BBL_FILE.read_text(encoding="utf-8", errors="replace")

    bibitems = parse_bibitems(bbl)
    print(f"Checking {len(bibitems)} cited references...")

    results = []
    for key, block in bibitems:
        results.append(check_entry(key, block))
        time.sleep(REQUEST_DELAY_S)

    for line in results:
        print(line)


if __name__ == "__main__":
    main()