From ad6229a899fbaf6d5e6d15811e55891bb04f6dea Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi <vj@eecs.harvard.edu>
Date: Wed, 25 Feb 2026 07:48:18 -0500
Subject: [PATCH] Adds options for targeted reference validation

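Introduces `--only-from-report FILE` and `--only-keys FILE` arguments to the `references` validation command.
These allow re-validating only specific citation keys, taken either from a previous validation report or from a custom list (one key per line).
If both options are given, `--only-from-report` takes precedence and `--only-keys` is ignored; if the given file does not exist, an error is reported and no references are validated.
This significantly improves the workflow for correcting references by enabling focused re-runs and reducing validation time.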

Removes the standalone `README_REFERENCE_CHECK.md` documentation, as its content is now covered by the integrated CLI help and the broader documentation.
---
 book/cli/commands/reference_check.py         |  5 +-
 book/cli/commands/validate.py                | 21 +++++
 book/tools/scripts/README_REFERENCE_CHECK.md | 88 --------------------
 3 files changed, 25 insertions(+), 89 deletions(-)
 delete mode 100644 book/tools/scripts/README_REFERENCE_CHECK.md

diff --git a/book/cli/commands/reference_check.py b/book/cli/commands/reference_check.py
index feda8bf9d..7ea322000 100644
--- a/book/cli/commands/reference_check.py
+++ b/book/cli/commands/reference_check.py
@@ -278,7 +278,10 @@ def run(
         return True, int((time.time() - t0) * 1000), [], 0
 
     if console:
-        console.print(f"Validating {n} references against academic databases...")
+        if only_keys is not None:
+            console.print(f"Validating {n} references (only keys with issues from report/file)...")
+        else:
+            console.print(f"Validating {n} references against academic databases...")
         if skip_verified and not thorough and cache_path:
             console.print("(Skipping refs already marked verified in cache)\n")
         if os.environ.get("OPENALEX_KEY") or os.environ.get("S2_API_KEY"):
diff --git a/book/cli/commands/validate.py b/book/cli/commands/validate.py
index ac3ce5cc6..a15dfbaa3 100644
--- a/book/cli/commands/validate.py
+++ b/book/cli/commands/validate.py
@@ -235,6 +235,8 @@ class ValidateCommand:
         parser.add_argument("--skip-verified", dest="refs_skip_verified", action="store_true", help="references: skip refs already verified in cache")
         parser.add_argument("--thorough", dest="refs_thorough", action="store_true", help="references: revalidate all refs (ignore cache)")
         parser.add_argument("--refs-cache", dest="refs_cache", metavar="FILE", help="references: cache file (default: .references_verified.json in repo root)")
+        parser.add_argument("--only-from-report", dest="refs_only_from_report", metavar="FILE", help="references: validate only keys that had issues in this report file")
+        parser.add_argument("--only-keys", dest="refs_only_keys_file", metavar="FILE", help="references: validate only keys listed in FILE (one key per line)")
 
         try:
             ns = parser.parse_args(args)
@@ -2825,6 +2827,24 @@ class ValidateCommand:
         else:
             cache_path = repo_root / ".references_verified.json"
 
+        only_keys: Optional[List[str]] = None
+        only_from_report = getattr(ns, "refs_only_from_report", None)
+        only_keys_file = getattr(ns, "refs_only_keys_file", None)
+        if only_from_report:
+            report_path = Path(only_from_report) if Path(only_from_report).is_absolute() else repo_root / only_from_report
+            if report_path.exists():
+                only_keys = reference_check.parse_report_keys(report_path)
+            else:
+                console.print(f"[red]Report not found: {report_path}[/red]")
+                return ValidationRunResult(name="references", description="Bibliography vs academic DBs (hallucinator)", files_checked=0, issues=[ValidationIssue(file=str(report_path), line=0, code="references", message=f"Report not found: {report_path}", severity="error")], elapsed_ms=0)
+        elif only_keys_file:
+            keys_path = Path(only_keys_file) if Path(only_keys_file).is_absolute() else repo_root / only_keys_file
+            if keys_path.exists():
+                only_keys = [line.strip() for line in keys_path.read_text(encoding="utf-8").splitlines() if line.strip()]
+            else:
+                console.print(f"[red]Keys file not found: {keys_path}[/red]")
+                return ValidationRunResult(name="references", description="Bibliography vs academic DBs (hallucinator)", files_checked=0, issues=[ValidationIssue(file=str(keys_path), line=0, code="references", message=f"Keys file not found: {keys_path}", severity="error")], elapsed_ms=0)
+
         passed, elapsed_ms, issue_dicts, files_checked = reference_check.run(
             bib_paths,
             output_path=output_path,
@@ -2835,6 +2855,7 @@ class ValidateCommand:
             cache_path=cache_path,
             skip_verified=skip_verified,
             thorough=thorough,
+            only_keys=only_keys,
         )
         issues = [
             ValidationIssue(
diff --git a/book/tools/scripts/README_REFERENCE_CHECK.md b/book/tools/scripts/README_REFERENCE_CHECK.md
deleted file mode 100644
index 44f3998c4..000000000
--- a/book/tools/scripts/README_REFERENCE_CHECK.md
+++ /dev/null
@@ -1,88 +0,0 @@
-# Reference check (hallucinator)
-
-Validates bibliography entries in the book’s `.bib` files against academic databases (CrossRef, arXiv, DBLP, Semantic Scholar, etc.) using [hallucinator](https://github.com/gianlucasb/hallucinator). **Native Binder CLI only** — implementation lives in `book/cli/commands/reference_check.py`. Use **`./book/binder validate references`** (not a standalone script).
-
-## Run
-
-From repo root:
-
-```bash
-# Default: vol1 + vol2 references.bib
-./book/binder validate references
-
-# One .bib file, report to file, quick test (first 5 refs)
-./book/binder validate references -f book/quarto/contents/vol1/backmatter/references.bib -o report.txt --limit 5
-
-# Full run, save report
-./book/binder validate references -o book/tools/scripts/reference_check_report.txt
-```
-
-Options: `-f` / `--file BIB`, `-o` / `--output FILE`, `--limit N`, `--skip-verified`, `--thorough`, `--refs-cache FILE`.
-
-- **Output**: Each ref is printed with its citation key and status (✓ verified, ? not found, ~ author_mismatch, ! error). After the summary, a **Not verified** block lists every key that needs review.
-- **Cache**: By default a cache file `.references_verified.json` (repo root) stores per-key status so future runs can skip refs already verified.
-  - `--skip-verified`: Only validate refs that are not already verified in the cache (faster repeat runs).
-  - `--thorough`: Revalidate all refs and ignore cache for filtering (cache is still updated after the run).
-  - `--refs-cache FILE`: Use a different cache file (default: repo root `.references_verified.json`).
-
-## Install
-
-One of:
-
-- `pip install -e ".[reference-check]"` (from repo root; optional extra)
-- `pip install -r book/tools/dependencies/requirements.txt` (book tooling deps)
-- `pip install hallucinator bibtexparser` (minimal)
-
-Optional env: `OPENALEX_KEY`, `S2_API_KEY`.
-
-## Rate limits
-
-**Semantic Scholar (S2)** allows **1 request per second** (cumulative across endpoints). With `S2_API_KEY` set, full runs over many references will take longer because of this limit; the validator may back off when rate-limited. Use `--limit N` for quick checks, or run full validation when you can leave it running.
-
-## Results
-
-- **Verified** — Found in a database with matching authors.
-- **Not found** — Not in any checked DB (may still be valid: reports, books, very new papers). Check manually.
-- **Author mismatch** — Title matched but authors differ.
-- **Error** — Validator crashed or timed out for that ref (resilient mode skips and continues).
-
-Exit code: `0` if all verified; `1` if any not found, mismatch, or error.
-
-## Using the report (do not auto-correct)
-
-**We do not auto-correct or rewrite `.bib` from this check.** Reasons:
-
-- **Not found** — Many valid sources are not in academic DBs: vendor docs, standards (IEEE, ISO), blog posts, manuals, reports. Auto-“fixing” would delete or overwrite them.
-- **Author mismatch** — Often formatting (e.g. “Smith, J.” vs “J. Smith”) or multi-author ordering; DB metadata can be wrong.
-- **Risk** — Applying DB metadata blindly can introduce wrong DOIs, wrong authors, or duplicate entries.
-
-**Use the report as a manual review list:**
-
-1. **Not found (186 in your run)**  
-   - If the work has a DOI or arXiv ID, add it to the entry and re-run; many will then verify.  
-   - If it’s a report, standard, or doc, leave as-is and optionally add a `note` that it’s not in academic DBs.
-
-2. **Author mismatch (23 in your run)**  
-   - Open the entry in the report and in your `.bib`; compare authors.  
-   - Fix only if the `.bib` is clearly wrong (typo, wrong person); ignore harmless formatting differences.
-
-3. **Getting keys for batch review**  
-   From the report file you can pull citation keys, e.g.:
-   ```bash
-   # Keys that were not found (for grepping .bib or scripting)
-   grep -E '^\s+\[[^]]+\]' report.txt | sed 's/.*\[\([^]]*\)\].*/\1/'
-   ```
-   Or use the “Not verified” block printed at the end of `binder validate references` (same keys + status + title).
-
-## Pre-commit
-
-The hallucinator reference check is **not** in pre-commit (it is slow and uses optional deps/API keys). Pre-commit’s “book-check-references” runs `binder check refs` (in-repo citation/label checks), not this. Run `binder validate references` manually or in CI. For a quick gate use `--limit 10` or `--skip-verified`.
-
-## Betterbib
-
-The cache (`.references_verified.json`) records which citation keys were verified; **betterbib does not read it**. To avoid overwriting entries you’ve already verified:
-
-- Run reference check first, then run betterbib only on files or keys you’re editing, or
-- Run reference check after betterbib and fix any newly introduced issues.
-
-A future wrapper could run betterbib only on keys that are not in the cache or not verified.