mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-08 02:28:25 -05:00
Small bbl-validation helper for the interviews paper bibliography. Reads paper.bbl, extracts each bibitem's rough title, queries CrossRef, and prints [OK] / [WARN] / [ERR] per citation key. Useful as a spot-check after large bibliography edits to catch typos, wrong years, or silently-renamed works. Placed alongside the other paper-tooling scripts (analyze_corpus.py, generate_figures.py, generate_macros.py). Path resolution uses Path(__file__).parent so it works from any CWD.
70 lines
2.2 KiB
Python
70 lines
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Validate bibliography entries in paper.bbl against CrossRef.
|
|
|
|
Reads paper.bbl from the paper directory, extracts each \\bibitem's rough
|
|
title, queries CrossRef, and reports whether a match was found. Useful as a
|
|
spot-check after large bibliography edits to catch typos, wrong years, or
|
|
silently-renamed works.
|
|
|
|
Run: python3 interviews/paper/scripts/validate_refs.py
|
|
Reads: ../paper.bbl (generated by LaTeX from references.bib)
|
|
Writes: stdout — [OK] / [WARN] / [ERR] per citation key
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
import urllib.parse
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
SCRIPTS_DIR = Path(__file__).parent
PAPER_DIR = SCRIPTS_DIR.parent
BBL_FILE = PAPER_DIR / "paper.bbl"

# Value sent as the User-Agent header on every CrossRef request.
USER_AGENT = "mailto:vj@eecs.harvard.edu"
# Seconds to wait for a single CrossRef response before giving up on that key.
REQUEST_TIMEOUT_S = 15
# Pause between consecutive requests so the API is not hammered.
REQUEST_DELAY_S = 0.1

# Matches "\bibitem[<label>]{<key>}" as well as the label-less "\bibitem{<key>}".
# DOTALL is required because BibTeX wraps long labels (author lists) across
# lines; without it those entries would be skipped silently.
BIBITEM_RE = re.compile(r"\\bibitem(?:\[.*?\])?\{([^}]*)\}", re.DOTALL)
# Very rough title extraction: text after the first \newblock up to a period.
NEWBLOCK_RE = re.compile(r"\\newblock (.*?)\.")
LATEX_COMMAND_RE = re.compile(r"\\[a-zA-Z]+")
LATEX_PUNCT_RE = re.compile(r"[\{\}\[\]\$\^]")


def parse_bbl(bbl: str) -> list:
    """Split the text of a .bbl file into ``(citation_key, entry_body)`` pairs.

    Everything before the first ``\\bibitem`` is discarded. The body of the
    last entry runs to the end of the text (so it may include the closing
    ``\\end{thebibliography}``). Returns an empty list when no ``\\bibitem``
    is present.
    """
    # re.split with one capture group yields [preamble, key1, body1, key2, ...].
    parts = BIBITEM_RE.split(bbl)[1:]
    return list(zip(parts[0::2], parts[1::2]))


def extract_title(key: str, block: str) -> str:
    """Return a rough, LaTeX-free title for one bibliography entry.

    The title is taken to be the text after the first ``\\newblock`` up to
    the next period. LaTeX commands and grouping/math punctuation are
    stripped, ``~`` ties become spaces, and whitespace is collapsed. If no
    usable title (at least 5 characters) can be found, *key* is returned so
    the caller still has something to query with.
    """
    match = NEWBLOCK_RE.search(block.replace("\n", " "))
    title = match.group(1) if match else key
    title = LATEX_COMMAND_RE.sub("", title)
    # A tie is a non-breaking space; deleting it would glue words together.
    title = title.replace("~", " ")
    title = LATEX_PUNCT_RE.sub("", title)
    # Wrapped .bbl lines leave runs of spaces behind; normalise them.
    title = " ".join(title.split())
    return title if len(title) >= 5 else key


def lookup_title(title: str, timeout: float = REQUEST_TIMEOUT_S):
    """Query CrossRef for *title* and return the title of the top hit.

    Returns ``None`` when CrossRef reports no items, and ``""`` when the top
    hit carries no title. Network, HTTP, and JSON errors propagate to the
    caller.
    """
    url = (
        "https://api.crossref.org/works?query.title="
        + urllib.parse.quote(title)
        + "&select=title,author,URL&rows=1"
    )
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    # Explicit timeout: without one a stalled connection blocks the whole run.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        data = json.loads(response.read().decode("utf-8"))

    items = data["message"]["items"]
    if not items:
        return None
    # "title" may be absent or an empty list; treat both as "no title".
    titles = items[0].get("title") or [""]
    return titles[0]


def check_reference(key: str, block: str) -> str:
    """Validate one bibliography entry and return its report line.

    The line starts with ``[OK]``, ``[WARN]`` or ``[ERR]``. ``[OK]`` only
    means CrossRef returned a top hit; the hit's title is included in the
    line so it can be compared against the bibliography by eye.
    """
    title = extract_title(key, block)
    try:
        found = lookup_title(title)
    except Exception as e:
        # Deliberately broad: this is the per-reference boundary of a
        # best-effort checker, so any failure becomes an [ERR] line instead
        # of aborting the remaining lookups.
        return f"[ERR] {key}: {e} for '{title}'"
    if found is None:
        return f"[WARN] {key}: No CrossRef hit for '{title}'"
    return f"[OK] {key}: Found '{found}'"


def main() -> None:
    """Check every ``\\bibitem`` in ``paper.bbl`` and print one line per key."""
    try:
        # Explicit encoding so the result does not depend on the platform
        # default; undecodable bytes are replaced rather than aborting.
        bbl = BBL_FILE.read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        raise SystemExit(
            f"{BBL_FILE} not found (it is generated by LaTeX from references.bib)"
        ) from None

    references = parse_bbl(bbl)
    print(f"Checking {len(references)} cited references...")

    results = []
    for key, block in references:
        results.append(check_reference(key, block))
        time.sleep(REQUEST_DELAY_S)

    for line in results:
        print(line)


if __name__ == "__main__":
    main()
|