From ad6229a899fbaf6d5e6d15811e55891bb04f6dea Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Wed, 25 Feb 2026 07:48:18 -0500 Subject: [PATCH] Adds options for targeted reference validation Introduces `--only-from-report` and `--only-keys` arguments to the `references` validation command. These allow re-validating only specific citation keys, either from a previous validation report or a custom list. This significantly improves the workflow for correcting references by enabling focused re-runs and reducing validation time. Removes the standalone `README_REFERENCE_CHECK.md` documentation, as its content is now implicitly handled by the integrated CLI help and broader documentation. --- book/cli/commands/reference_check.py | 5 +- book/cli/commands/validate.py | 21 +++++ book/tools/scripts/README_REFERENCE_CHECK.md | 88 -------------------- 3 files changed, 25 insertions(+), 89 deletions(-) delete mode 100644 book/tools/scripts/README_REFERENCE_CHECK.md diff --git a/book/cli/commands/reference_check.py b/book/cli/commands/reference_check.py index feda8bf9d..7ea322000 100644 --- a/book/cli/commands/reference_check.py +++ b/book/cli/commands/reference_check.py @@ -278,7 +278,10 @@ def run( return True, int((time.time() - t0) * 1000), [], 0 if console: - console.print(f"Validating {n} references against academic databases...") + if only_keys is not None: + console.print(f"Validating {n} references (only keys with issues from report/file)...") + else: + console.print(f"Validating {n} references against academic databases...") if skip_verified and not thorough and cache_path: console.print("(Skipping refs already marked verified in cache)\n") if os.environ.get("OPENALEX_KEY") or os.environ.get("S2_API_KEY"): diff --git a/book/cli/commands/validate.py b/book/cli/commands/validate.py index ac3ce5cc6..a15dfbaa3 100644 --- a/book/cli/commands/validate.py +++ b/book/cli/commands/validate.py @@ -235,6 +235,8 @@ class ValidateCommand: parser.add_argument("--skip-verified", dest="refs_skip_verified", action="store_true", help="references: skip refs already verified in cache") parser.add_argument("--thorough", dest="refs_thorough", action="store_true", help="references: revalidate all refs (ignore cache)") parser.add_argument("--refs-cache", dest="refs_cache", metavar="FILE", help="references: cache file (default: .references_verified.json in repo root)") + parser.add_argument("--only-from-report", dest="refs_only_from_report", metavar="FILE", help="references: validate only keys that had issues in this report file") + parser.add_argument("--only-keys", dest="refs_only_keys_file", metavar="FILE", help="references: validate only keys listed in FILE (one key per line)") try: ns = parser.parse_args(args) @@ -2825,6 +2827,24 @@ class ValidateCommand: else: cache_path = repo_root / ".references_verified.json" + only_keys: Optional[List[str]] = None + only_from_report = getattr(ns, "refs_only_from_report", None) + only_keys_file = getattr(ns, "refs_only_keys_file", None) + if only_from_report: + report_path = Path(only_from_report) if Path(only_from_report).is_absolute() else repo_root / only_from_report + if report_path.exists(): + only_keys = reference_check.parse_report_keys(report_path) + else: + console.print(f"[red]Report not found: {report_path}[/red]") + return ValidationRunResult(name="references", description="Bibliography vs academic DBs (hallucinator)", files_checked=0, issues=[ValidationIssue(file=str(report_path), line=0, code="references", message=f"Report not found: {report_path}", severity="error")], elapsed_ms=0) + elif only_keys_file: + keys_path = Path(only_keys_file) if Path(only_keys_file).is_absolute() else repo_root / only_keys_file + if keys_path.exists(): + only_keys = [line.strip() for line in keys_path.read_text(encoding="utf-8").splitlines() if line.strip()] + else: + console.print(f"[red]Keys file not found: {keys_path}[/red]") + return ValidationRunResult(name="references", description="Bibliography vs academic DBs (hallucinator)", files_checked=0, issues=[ValidationIssue(file=str(keys_path), line=0, code="references", message=f"Keys file not found: {keys_path}", severity="error")], elapsed_ms=0) + passed, elapsed_ms, issue_dicts, files_checked = reference_check.run( bib_paths, output_path=output_path, @@ -2835,6 +2855,7 @@ class ValidateCommand: cache_path=cache_path, skip_verified=skip_verified, thorough=thorough, + only_keys=only_keys, ) issues = [ ValidationIssue( diff --git a/book/tools/scripts/README_REFERENCE_CHECK.md b/book/tools/scripts/README_REFERENCE_CHECK.md deleted file mode 100644 index 44f3998c4..000000000 --- a/book/tools/scripts/README_REFERENCE_CHECK.md +++ /dev/null @@ -1,88 +0,0 @@ -# Reference check (hallucinator) - -Validates bibliography entries in the book’s `.bib` files against academic databases (CrossRef, arXiv, DBLP, Semantic Scholar, etc.) using [hallucinator](https://github.com/gianlucasb/hallucinator). **Native Binder CLI only** — implementation lives in `book/cli/commands/reference_check.py`. Use **`./book/binder validate references`** (not a standalone script). - -## Run - -From repo root: - -```bash -# Default: vol1 + vol2 references.bib -./book/binder validate references - -# One .bib file, report to file, quick test (first 5 refs) -./book/binder validate references -f book/quarto/contents/vol1/backmatter/references.bib -o report.txt --limit 5 - -# Full run, save report -./book/binder validate references -o book/tools/scripts/reference_check_report.txt -``` - -Options: `-f` / `--file BIB`, `-o` / `--output FILE`, `--limit N`, `--skip-verified`, `--thorough`, `--refs-cache FILE`. - -- **Output**: Each ref is printed with its citation key and status (✓ verified, ? not found, ~ author_mismatch, ! error). After the summary, a **Not verified** block lists every key that needs review. -- **Cache**: By default a cache file `.references_verified.json` (repo root) stores per-key status so future runs can skip refs already verified. - - `--skip-verified`: Only validate refs that are not already verified in the cache (faster repeat runs). - - `--thorough`: Revalidate all refs and ignore cache for filtering (cache is still updated after the run). - - `--refs-cache FILE`: Use a different cache file (default: repo root `.references_verified.json`). - -## Install - -One of: - -- `pip install -e ".[reference-check]"` (from repo root; optional extra) -- `pip install -r book/tools/dependencies/requirements.txt` (book tooling deps) -- `pip install hallucinator bibtexparser` (minimal) - -Optional env: `OPENALEX_KEY`, `S2_API_KEY`. - -## Rate limits - -**Semantic Scholar (S2)** allows **1 request per second** (cumulative across endpoints). With `S2_API_KEY` set, full runs over many references will take longer because of this limit; the validator may back off when rate-limited. Use `--limit N` for quick checks, or run full validation when you can leave it running. - -## Results - -- **Verified** — Found in a database with matching authors. -- **Not found** — Not in any checked DB (may still be valid: reports, books, very new papers). Check manually. -- **Author mismatch** — Title matched but authors differ. -- **Error** — Validator crashed or timed out for that ref (resilient mode skips and continues). - -Exit code: `0` if all verified; `1` if any not found, mismatch, or error. - -## Using the report (do not auto-correct) - -**We do not auto-correct or rewrite `.bib` from this check.** Reasons: - -- **Not found** — Many valid sources are not in academic DBs: vendor docs, standards (IEEE, ISO), blog posts, manuals, reports. Auto-“fixing” would delete or overwrite them. -- **Author mismatch** — Often formatting (e.g. “Smith, J.” vs “J. Smith”) or multi-author ordering; DB metadata can be wrong. -- **Risk** — Applying DB metadata blindly can introduce wrong DOIs, wrong authors, or duplicate entries. - -**Use the report as a manual review list:** - -1. **Not found (186 in your run)** - - If the work has a DOI or arXiv ID, add it to the entry and re-run; many will then verify. - - If it’s a report, standard, or doc, leave as-is and optionally add a `note` that it’s not in academic DBs. - -2. **Author mismatch (23 in your run)** - - Open the entry in the report and in your `.bib`; compare authors. - - Fix only if the `.bib` is clearly wrong (typo, wrong person); ignore harmless formatting differences. - -3. **Getting keys for batch review** - From the report file you can pull citation keys, e.g.: - ```bash - # Keys that were not found (for grepping .bib or scripting) - grep -E '^\s+\[[^]]+\]' report.txt | sed 's/.*\[\([^]]*\)\].*/\1/' - ``` - Or use the “Not verified” block printed at the end of `binder validate references` (same keys + status + title). - -## Pre-commit - -The hallucinator reference check is **not** in pre-commit (it is slow and uses optional deps/API keys). Pre-commit’s “book-check-references” runs `binder check refs` (in-repo citation/label checks), not this. Run `binder validate references` manually or in CI. For a quick gate use `--limit 10` or `--skip-verified`. - -## Betterbib - -The cache (`.references_verified.json`) records which citation keys were verified; **betterbib does not read it**. To avoid overwriting entries you’ve already verified: - -- Run reference check first, then run betterbib only on files or keys you’re editing, or -- Run reference check after betterbib and fix any newly introduced issues. - -A future wrapper could run betterbib only on keys that are not in the cache or not verified.