# cs249r_book/book/cli/commands/reference_check.py
"""
Native reference check: validate .bib entries against academic DBs (hallucinator).
Used by `binder validate references`. Requires: pip install hallucinator bibtexparser
(or install optional extra: pip install -e ".[reference-check]").
Optional env: OPENALEX_KEY, S2_API_KEY. Note: Semantic Scholar allows 1 request/sec;
full runs with S2_API_KEY set will be slow; use --limit for quick tests.
"""
from __future__ import annotations
import json
import os
import re
import subprocess
import sys
import time
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote_plus
from types import SimpleNamespace
from typing import Any, Dict, List, Optional, Tuple
# Optional deps; fail at run time with a clear message if missing
try:
    import bibtexparser
except ImportError:
    bibtexparser = None  # type: ignore[assignment]
try:
    from hallucinator import Reference, Validator, ValidatorConfig
except ImportError:
    Reference = Validator = ValidatorConfig = None  # type: ignore[assignment,misc]
MIN_TITLE_WORDS = 4  # skip entries whose titles are too short to search reliably
# Default .bib paths relative to repo root (when book_dir.parent.parent is repo root)
DEFAULT_BIB_REL_PATHS = [
    "book/quarto/contents/vol1/backmatter/references.bib",
    "book/quarto/contents/vol2/backmatter/references.bib",
]
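# The validator runs in a child Python process (see _validate_resilient below)
# so a crash or hang inside the hallucinator library cannot take down the
# whole run; each child validates exactly one reference and prints one
# tab-separated "status<TAB>source<TAB>title" line.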
_CHILD_SCRIPT = r"""
import json, os, sys
from hallucinator import Reference, Validator, ValidatorConfig
ref_dict = json.loads(sys.argv[1])
ref = Reference(
    ref_dict["title"],
    authors=ref_dict.get("authors") or [],
    doi=ref_dict.get("doi"),
    arxiv_id=ref_dict.get("arxiv_id"),
)
config = ValidatorConfig()
if os.environ.get("OPENALEX_KEY"):
    config.openalex_key = os.environ["OPENALEX_KEY"]
if os.environ.get("S2_API_KEY"):
    config.s2_api_key = os.environ["S2_API_KEY"]
validator = Validator(config)
results = validator.check([ref])
r = results[0]
print(r.status, r.source or "", r.title, sep="\t")
"""
def _to_ascii(s: str) -> str:
    """Best-effort ASCII transliteration (accents stripped via NFKD)."""
    if not s:
        return s
    n = unicodedata.normalize("NFKD", s)
    return n.encode("ascii", "ignore").decode("ascii")


def _normalize_title(raw: str) -> str:
    """Remove BibTeX braces and collapse whitespace in a title."""
    if not raw:
        return ""
    t = re.sub(r"[\{\}]", "", raw)
    t = re.sub(r"\s+", " ", t).strip()
    return t


def _parse_authors(author_field: str) -> List[str]:
    """Extract up to 15 family names from a BibTeX author field."""
    if not author_field or not author_field.strip():
        return []
    authors = []
    for part in re.split(r"\s+and\s+", author_field, flags=re.IGNORECASE):
        part = part.strip()
        if not part:
            continue
        if "," in part:
            family = part.split(",", 1)[0].strip()
        else:
            family = part
        family = re.sub(r"\\[a-z]+\{([^}]*)\}", r"\1", family)  # unwrap LaTeX commands, e.g. \v{c} -> c
        family = re.sub(r"[{}\\]", "", family).strip()
        family = _to_ascii(family)
        if family:
            authors.append(family)
    return authors[:15]


def _extract_arxiv_id(entry: dict) -> Optional[str]:
    """Return an arXiv id from eprint fields or an arxiv.org/abs URL, if present."""
    ap = (entry.get("archiveprefix") or "").strip().lower()
    eprint = (entry.get("eprint") or "").strip()
    if ap == "arxiv" and eprint:
        return eprint
    url = entry.get("url") or ""
    m = re.search(r"arxiv\.org/abs/(\d+\.\d+v?\d*)", url, re.IGNORECASE)
    if m:
        return m.group(1)
    return None


def _bib_entries_to_references(bib_path: Path) -> List[Tuple[str, Any]]:
    """Parse a .bib file into (citation_key, Reference) pairs, skipping short titles."""
    with open(bib_path, encoding="utf-8", errors="replace") as f:
        bib_str = f.read()
    parser = bibtexparser.bparser.BibTexParser(common_strings=True)
    parser.ignore_nonstandard_types = False
    db = bibtexparser.loads(bib_str, parser)
    out = []
    for entry in db.entries:
        key = entry.get("ID", "")
        title = _normalize_title(entry.get("title", ""))
        if not title or len(title.split()) < MIN_TITLE_WORDS:
            continue
        title = _to_ascii(title)
        authors = _parse_authors(entry.get("author", ""))
        doi = (entry.get("doi") or "").strip() or None
        arxiv_id = _extract_arxiv_id(entry)
        ref = Reference(title=title, authors=authors, doi=doi, arxiv_id=arxiv_id)
        out.append((key, ref))
    return out


def _dedupe_refs(items: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]:
    """Drop entries whose (title, doi, arxiv_id) signature has already been seen."""
    seen: set = set()
    out = []
    for key, ref in items:
        sig = (ref.title, ref.doi, ref.arxiv_id)
        if sig in seen:
            continue
        seen.add(sig)
        out.append((key, ref))
    return out
def _validate_resilient(keys: List[str], refs: List[Any], console: Optional[Any]) -> List[Any]:
    """Validate each ref in its own subprocess so one crash or hang cannot abort the run."""
    results = []
    n = len(refs)
    key_w = min(36, max(12, max(len(k) for k in keys) if keys else 12))
    for i, (key, ref) in enumerate(zip(keys, refs)):
        payload = {
            "title": ref.title,
            "authors": ref.authors,
            "doi": ref.doi,
            "arxiv_id": ref.arxiv_id,
        }
        try:
            proc = subprocess.run(
                [sys.executable, "-c", _CHILD_SCRIPT, json.dumps(payload)],
                capture_output=True,
                text=True,
                timeout=90,
                env=os.environ,
            )
        except subprocess.TimeoutExpired:
            results.append(SimpleNamespace(status="error", title=ref.title, source="timeout"))
            if console:
                console.print(f" [{i+1:>{len(str(n))}}/{n}] {key[:key_w]:<{key_w}} ! error (timeout)")
            continue
        if proc.returncode != 0 or not proc.stdout.strip():
            results.append(SimpleNamespace(status="error", title=ref.title, source="validator crash"))
            if console:
                console.print(f" [{i+1:>{len(str(n))}}/{n}] {key[:key_w]:<{key_w}} ! error (crash)")
            continue
        parts = proc.stdout.strip().split("\t", 2)
        status = parts[0] if parts else "error"
        source = (parts[1] or None) if len(parts) > 1 else None
        title_out = parts[2] if len(parts) > 2 else ref.title
        results.append(SimpleNamespace(status=status, title=title_out, source=source))
        if console:
            icon = {"verified": "\u2713", "not_found": "?", "author_mismatch": "~"}.get(status, "!")
            src = f" ({source})" if source else ""
            if status == "verified":
                console.print(f" [{i+1:>{len(str(n))}}/{n}] {key[:key_w]:<{key_w}} {icon} verified{src}")
            else:
                console.print(f" [{i+1:>{len(str(n))}}/{n}] {key[:key_w]:<{key_w}} {icon} {status}{src}")
    return results


def _load_cache(cache_path: Path) -> Dict[str, dict]:
    """Read the verification cache; return {} on any read or parse failure."""
    if not cache_path.exists():
        return {}
    try:
        with open(cache_path, encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError):
        return {}


def _save_cache(cache_path: Path, updates: Dict[str, dict]) -> None:
    """Merge updates into the on-disk cache, creating parent dirs as needed."""
    existing = _load_cache(cache_path)
    existing.update(updates)
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(existing, f, indent=2)
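# Cache file shape: a JSON object keyed by citation key (the entry below is an
# illustrative example, not real data):
#   {"someKey2017": {"status": "verified", "source": "crossref",
#    "date": "2025-01-01T00:00:00+00:00"}}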
def parse_report_keys(report_path: Path) -> List[str]:
    """
    Parse a reference-check report and return citation keys from the
    "Not found" and "Author mismatch" sections (for re-validating only those).

    Matches the indented entry lines written by run(), e.g. " [someKey2017] Title...".
    """
    keys: List[str] = []
    pattern = re.compile(r"^\s+\[([^\]]+)\]\s")
    try:
        with open(report_path, encoding="utf-8") as f:
            for line in f:
                m = pattern.match(line)
                if m:
                    keys.append(m.group(1))
    except OSError:
        pass
    return keys
def run(
    bib_paths: List[Path],
    *,
    output_path: Optional[Path] = None,
    limit: Optional[int] = None,
    dedupe: bool = True,
    resilient: bool = True,
    console: Optional[Any] = None,
    cache_path: Optional[Path] = None,
    skip_verified: bool = False,
    thorough: bool = False,
    only_keys: Optional[List[str]] = None,
) -> Tuple[bool, int, List[dict], int]:
    """
    Load .bib files, validate refs against academic DBs, optionally write a report.

    cache_path: if set, read/write a verification cache (key -> {status, source, date}).
    skip_verified: only validate refs not already verified in the cache (ignored if thorough).
    thorough: revalidate all refs and ignore the cache when filtering.
    only_keys: if set, validate only these citation keys (e.g. from a previous report).

    Returns:
        (passed, elapsed_ms, issues, ref_count)
        issues: list of dicts with file, line, code, message, severity for ValidationIssue.
    """
    if bibtexparser is None or Reference is None or Validator is None:
        return (
            False,
            0,
            [
                {
                    "file": "(reference_check)",
                    "line": 0,
                    "code": "references",
                    "message": "Missing deps: pip install hallucinator bibtexparser (or pip install -e \".[reference-check]\")",
                    "severity": "error",
                }
            ],
            0,
        )
    t0 = time.time()
    all_refs: List[Tuple[str, Any]] = []
    for p in bib_paths:
        if not p.exists():
            return (
                False,
                int((time.time() - t0) * 1000),
                [{"file": str(p), "line": 0, "code": "references", "message": f"Not found: {p}", "severity": "error"}],
                0,
            )
        all_refs.extend(_bib_entries_to_references(p))
    if not all_refs:
        return True, int((time.time() - t0) * 1000), [], 0
    if dedupe:
        all_refs = _dedupe_refs(all_refs)
    # Restrict to a subset of keys (e.g. from --only-from-report or --only-keys) before applying limit
    if only_keys is not None:
        allowed = set(only_keys)
        all_refs = [(k, r) for k, r in all_refs if k in allowed]
        if not all_refs and console:
            console.print(f"[yellow]No .bib entries matched the {len(allowed)} key(s) to check.[/yellow]")
        if not all_refs:
            return True, int((time.time() - t0) * 1000), [], 0
    if limit is not None:
        all_refs = all_refs[:limit]
    # Optionally skip refs already verified in cache
    if skip_verified and not thorough and cache_path:
        cache = _load_cache(cache_path)
        all_refs = [(k, r) for k, r in all_refs if cache.get(k, {}).get("status") != "verified"]
    refs = [r for _, r in all_refs]
    keys = [k for k, _ in all_refs]
    n = len(refs)
    if n == 0:
        # All refs were skipped as already verified
        return True, int((time.time() - t0) * 1000), [], 0
    if console:
        if only_keys is not None:
            console.print(f"Validating {n} references (only keys with issues from report/file)...")
        else:
            console.print(f"Validating {n} references against academic databases...")
        if skip_verified and not thorough and cache_path:
            console.print("(Skipping refs already marked verified in cache)\n")
        if os.environ.get("OPENALEX_KEY") or os.environ.get("S2_API_KEY"):
            console.print("(Using OPENALEX_KEY / S2_API_KEY for better coverage)\n")
        else:
            console.print("(Optional: set OPENALEX_KEY / S2_API_KEY for better coverage)\n")
    if resilient:
        results = _validate_resilient(keys, refs, console)
    else:
        config = ValidatorConfig()
        if os.environ.get("OPENALEX_KEY"):
            config.openalex_key = os.environ["OPENALEX_KEY"]
        if os.environ.get("S2_API_KEY"):
            config.s2_api_key = os.environ["S2_API_KEY"]
        validator = Validator(config)
        key_w = min(36, max(12, max(len(k) for k in keys) if keys else 12))

        def progress(event: Any) -> None:
            if event.event_type == "result" and console:
                r = event.result
                idx = event.index + 1
                key = keys[event.index] if event.index < len(keys) else ""
                icon = {"verified": "\u2713", "not_found": "?", "author_mismatch": "~"}.get(r.status, "!")
                src = f" ({r.source})" if r.source else ""
                if r.status == "verified":
                    console.print(f" [{idx:>{len(str(n))}}/{n}] {key[:key_w]:<{key_w}} {icon} verified{src}")
                else:
                    console.print(f" [{idx:>{len(str(n))}}/{n}] {key[:key_w]:<{key_w}} {icon} {r.status}{src}")

        results = validator.check(refs, progress=progress)
    elapsed_ms = int((time.time() - t0) * 1000)
    verified = sum(1 for r in results if r.status == "verified")
    not_found = sum(1 for r in results if r.status == "not_found")
    mismatch = sum(1 for r in results if r.status == "author_mismatch")
    errors = sum(1 for r in results if r.status == "error")
    passed = (not_found == 0 and mismatch == 0 and errors == 0)
    issues: List[dict] = []
    for key, r in zip(keys, results):
        if r.status == "not_found":
            issues.append({"file": key, "line": 0, "code": "references", "message": r.title, "severity": "error"})
        elif r.status == "author_mismatch":
            issues.append({"file": key, "line": 0, "code": "references", "message": f"author_mismatch: {r.title}", "severity": "error"})
        elif r.status == "error":
            issues.append({"file": key, "line": 0, "code": "references", "message": f"error: {r.title}", "severity": "error"})
    if console:
        console.print("")
        console.print("Summary")
        console.print("-------")
        console.print(f" Verified: {verified}")
        console.print(f" Not found: {not_found}")
        console.print(f" Author mismatch: {mismatch}")
        if errors:
            console.print(f" Error (skipped): {errors}")
        console.print(f" Total: {n}")
        not_verified = [(k, r) for k, r in zip(keys, results) if r.status in ("not_found", "author_mismatch", "error")]
        if not_verified:
            console.print("")
            console.print("Not verified (review these)")
            console.print("---------------------------")
            for key, r in not_verified:
                title = (r.title or "")[:72] + ("..." if len(r.title or "") > 72 else "")
                console.print(f" [{key}] {r.status}: {title}")
    if output_path is not None:
        output_path = Path(output_path)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("Hallucinator reference check report\n")
            f.write("====================================\n\n")
            f.write(f"Sources: {[str(p) for p in bib_paths]}\n\n")
            f.write(f"Verified: {verified}, Not found: {not_found}, Author mismatch: {mismatch}, Error: {errors}, Total: {n}\n\n")
            f.write("Not found (potential typos or non-indexed):\n")
            for key, r in zip(keys, results):
                if r.status == "not_found":
                    f.write(f" [{key}] {r.title}\n")
            f.write("\nAuthor mismatch:\n")
            for key, r in zip(keys, results):
                if r.status == "author_mismatch":
                    f.write(f" [{key}] {r.title}\n")
            err_list = [(k, r) for k, r in zip(keys, results) if r.status == "error"]
            if err_list:
                f.write("\nError (validator crash or timeout):\n")
                for key, r in err_list:
                    f.write(f" [{key}] {r.title}\n")
        # Companion file: one search link per not-verified entry (search online, then fix the .bib)
        not_verified = [(k, r) for k, r in zip(keys, results) if r.status in ("not_found", "author_mismatch", "error")]
        if not_verified:
            search_links_path = output_path.with_suffix(output_path.suffix + ".search-links.txt")
            with open(search_links_path, "w", encoding="utf-8") as sl:
                sl.write("# key\ttitle\tGoogle Scholar search URL (search online, then update .bib with DOI/arXiv or authors)\n")
                for key, r in zip(keys, results):
                    if r.status in ("not_found", "author_mismatch", "error"):
                        title = (r.title or "").strip()
                        q = quote_plus(title)
                        url = f"https://scholar.google.com/scholar?q={q}"
                        sl.write(f"{key}\t{title}\t{url}\n")
            if console:
                console.print(f"Search links written to {search_links_path} (open each URL to search online, then update .bib)")
        if console:
            console.print(f"\nReport written to {output_path}")
    if cache_path is not None:
        now = datetime.now(timezone.utc).isoformat()
        updates = {
            key: {"status": r.status, "source": r.source or "", "date": now}
            for key, r in zip(keys, results)
        }
        _save_cache(Path(cache_path), updates)
    return passed, elapsed_ms, issues, n
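

# Ad-hoc smoke test: a minimal sketch for running this module directly, not
# part of the binder CLI. It assumes the optional deps above are installed and
# simply spot-checks the first few entries of one .bib file via run().
if __name__ == "__main__":
    import argparse

    ap = argparse.ArgumentParser(description="Spot-check a .bib file against academic DBs")
    ap.add_argument("bib", type=Path, help="path to a references.bib file")
    ap.add_argument("--limit", type=int, default=5, help="check only the first N entries")
    args = ap.parse_args()
    ok, ms, issues, count = run([args.bib], limit=args.limit)
    print(f"Checked {count} reference(s) in {ms} ms: {'PASS' if ok else 'FAIL'}")
    for issue in issues:
        print(f" [{issue['file']}] {issue['message']}")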