Mirror of https://github.com/KohakuBlueleaf/KohakuHub.git (synced 2026-05-06 12:27:43 -05:00)
verify_seed_data.py hardcoded EXPECTED_SEED_VERSION = "local-dev-demo-v3" but seed_demo_data.py was bumped to v4 in the preview PR, so the post-seed verifier would falsely fail with a version mismatch. Extract the constant to scripts/dev/seed_shared.py and import it from both sides so the two scripts always agree.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
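For reference, a minimal sketch of what the extracted scripts/dev/seed_shared.py might look like; the exact version string is an assumption following the v4 bump described above, and the real module may differ:

    # scripts/dev/seed_shared.py -- single source of truth for the demo seed
    # version, imported by both seed_demo_data.py and verify_seed_data.py.
    SEED_VERSION = "local-dev-demo-v4"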
236 lines · 7.8 KiB · Python
#!/usr/bin/env python3
"""Verify local demo seed fixtures remain navigable and downloadable."""
from __future__ import annotations

import asyncio
import hashlib
import json
import sys
from pathlib import Path

import httpx

from seed_shared import SEED_VERSION

ROOT_DIR = Path(__file__).resolve().parents[2]
SRC_DIR = ROOT_DIR / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from kohakuhub.config import cfg
from kohakuhub.main import app
from kohakuhub.utils.s3 import init_storage

MANIFEST_PATH = ROOT_DIR / "hub-meta" / "dev" / "demo-seed-manifest.json"
INTERNAL_BASE_URL = (
    getattr(cfg.app, "internal_base_url", None)
    or cfg.app.base_url
    or "http://127.0.0.1:48888"
)

class VerifyError(RuntimeError):
    """Raised when local demo seed verification fails."""

def load_manifest() -> dict:
    if not MANIFEST_PATH.exists():
        raise VerifyError(
            "Missing demo seed manifest. Run `make seed-demo` before verification."
        )

    manifest = json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
    if manifest.get("seed_version") != SEED_VERSION:
        raise VerifyError(
            f"Expected seed version {SEED_VERSION}, "
            f"got {manifest.get('seed_version')!r}."
        )
    return manifest

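# Manifest shape relied on by this script (illustrative assumption; the real
# manifest may carry additional keys):
#   {
#     "seed_version": "<SEED_VERSION>",
#     "fallback_sources": [{"name": "...", "url": "...", "namespace": ""}]
#   }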
def require_path(entries: list[dict], expected_path: str) -> dict:
    for entry in entries:
        if entry.get("path") == expected_path:
            return entry
    available = ", ".join(sorted(entry.get("path", "<missing>") for entry in entries))
    raise VerifyError(
        f"Expected tree entry {expected_path!r} but only found: {available}"
    )

async def get_json(client: httpx.AsyncClient, path: str):
    response = await client.get(path)
    if response.status_code != 200:
        raise VerifyError(f"GET {path} returned {response.status_code}: {response.text}")
    return response.json()

async def head_ok(client: httpx.AsyncClient, path: str) -> httpx.Response:
    response = await client.head(path)
    if response.status_code != 200:
        raise VerifyError(
            f"HEAD {path} returned {response.status_code}: {response.text}"
        )
    return response

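# `resolve` endpoints reply with a 302 redirect to a presigned storage URL;
# the redirect target is fetched with a plain httpx client because it lives
# outside the in-process ASGI app.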
async def resolve_json_via_download(
    client: httpx.AsyncClient,
    path: str,
) -> dict:
    response = await client.get(path, follow_redirects=False)
    if response.status_code != 302:
        raise VerifyError(f"GET {path} returned {response.status_code}: {response.text}")

    location = response.headers.get("location")
    if not location:
        raise VerifyError(f"GET {path} did not return a download location.")

    async with httpx.AsyncClient(timeout=60.0) as download_client:
        download_response = await download_client.get(location)

    if download_response.status_code != 200:
        raise VerifyError(
            f"Downloading {path} from presigned URL returned "
            f"{download_response.status_code}."
        )

    try:
        return download_response.json()
    except json.JSONDecodeError as exc:
        raise VerifyError(f"Downloaded {path} was not valid JSON: {exc}") from exc

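# The API checks below exercise the FastAPI app in-process via
# httpx.ASGITransport, so no running server is required; only presigned
# download URLs are fetched over the network.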
async def verify_seed_data() -> dict:
    manifest = load_manifest()
    init_storage()
    transport = httpx.ASGITransport(app=app)

    summary = {
        "seed_version": manifest["seed_version"],
        "verified_checks": [],
    }

    async with httpx.AsyncClient(
        transport=transport,
        base_url=INTERNAL_BASE_URL,
        follow_redirects=False,
        timeout=60.0,
    ) as client:
        table_root = await get_json(
            client, "/api/datasets/open-media-lab/table-scan-fixtures/tree/main"
        )
        require_path(table_root, "metadata")
        summary["verified_checks"].append("table-scan-fixtures root tree")

        metadata_entries = await get_json(
            client,
            "/api/datasets/open-media-lab/table-scan-fixtures/tree/main/metadata",
        )
        require_path(metadata_entries, "metadata/features.json")
        summary["verified_checks"].append("table-scan-fixtures metadata tree")

        metadata_head = await head_ok(
            client,
            "/datasets/open-media-lab/table-scan-fixtures/resolve/main/metadata/features.json",
        )
        metadata_size = int(metadata_head.headers.get("content-length") or "0")
        if metadata_size <= 0:
            raise VerifyError(
                "metadata/features.json returned a non-positive content length."
            )
        summary["verified_checks"].append("table-scan-fixtures metadata HEAD")

        metadata_json = await resolve_json_via_download(
            client,
            "/datasets/open-media-lab/table-scan-fixtures/resolve/main/metadata/features.json",
        )
        if metadata_json != {"id": "string", "label": "string"}:
            raise VerifyError(
                "metadata/features.json returned unexpected content: "
                f"{metadata_json!r}"
            )
        summary["verified_checks"].append("table-scan-fixtures metadata download")

        hierarchy_root = await get_json(
            client,
            "/api/datasets/open-media-lab/hierarchy-crawl-fixtures/tree/main/catalog",
        )
        require_path(hierarchy_root, "catalog/section-01")
        if any(
            entry.get("path", "").startswith("catalog/catalog/")
            for entry in hierarchy_root
        ):
            raise VerifyError(
                "Hierarchy tree unexpectedly contains duplicated catalog prefixes."
            )
        summary["verified_checks"].append("hierarchy-crawl root tree")

        section_entries = await get_json(
            client,
            "/api/datasets/open-media-lab/hierarchy-crawl-fixtures/tree/main/catalog/section-01",
        )
        require_path(section_entries, "catalog/section-01/tier-01")
        summary["verified_checks"].append("hierarchy-crawl section tree")

        deep_json = await resolve_json_via_download(
            client,
            "/datasets/open-media-lab/hierarchy-crawl-fixtures/resolve/main/"
            "catalog/section-01/tier-01/branch-01/node-01-01-01/entry-01-01-01.json",
        )
        deep_json_path = (
            "catalog/section-01/tier-01/branch-01/"
            "node-01-01-01/entry-01-01-01.json"
        )
        expected_deep_json = {
            "checksum": hashlib.sha256(deep_json_path.encode("utf-8")).hexdigest(),
            "fixture": "hierarchy-crawl",
            "leaf": 1,
            "section": 1,
            "shard": 1,
        }
        if deep_json != expected_deep_json:
            raise VerifyError(
                "Deep hierarchy JSON returned unexpected content: "
                f"{deep_json!r}"
            )
        summary["verified_checks"].append("hierarchy-crawl deep JSON download")

        expected_fallback_sources = {
            (source["url"].rstrip("/"), source["name"])
            for source in manifest.get("fallback_sources", [])
            if source.get("namespace", "") == ""
        }
        if expected_fallback_sources:
            available_sources = await get_json(
                client, "/api/fallback-sources/available"
            )
            available = {
                (entry.get("url", "").rstrip("/"), entry.get("name"))
                for entry in available_sources
            }
            missing = expected_fallback_sources - available
            if missing:
                raise VerifyError(
                    "Missing seeded fallback sources: "
                    + ", ".join(sorted(f"{name} ({url})" for url, name in missing))
                )
            summary["verified_checks"].append("fallback sources available")

    return summary

def main() -> int:
    try:
        summary = asyncio.run(verify_seed_data())
    except VerifyError as exc:
        print(f"Seed verification failed: {exc}", file=sys.stderr)
        return 1

    print(json.dumps(summary, indent=2, sort_keys=True))
    return 0

if __name__ == "__main__":
    raise SystemExit(main())