KohakuHub/scripts/dev/seed_demo_data.py

#!/usr/bin/env python3
"""Create deterministic local demo data through KohakuHub's API surface."""

from __future__ import annotations

import asyncio
import base64
import hashlib
import io
import json
import math
import sys
import tarfile
import tempfile
import textwrap
from collections.abc import Callable, Iterable
from contextlib import AsyncExitStack
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlsplit

import httpx
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import pyarrow as pa
import pyarrow.parquet as pq
import requests
from hfutils import index as hf_index
from safetensors.numpy import save as save_safetensors
from seed_shared import SEED_VERSION

# Repository root is two directory levels above this script's own directory.
ROOT_DIR = Path(__file__).resolve().parents[2]
SRC_DIR = ROOT_DIR / "src"
# Put ``<repo>/src`` on sys.path (once) so the ``kohakuhub`` imports that
# follow can be resolved from the source tree.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from kohakuhub.config import cfg
from kohakuhub.main import app
from kohakuhub.utils.s3 import init_storage

# NOTE(review): presumably the password given to every seeded demo account;
# its use is outside this section of the file — confirm.
DEFAULT_PASSWORD = "KohakuDev123!"
# Same username as the first entry in ACCOUNTS.
PRIMARY_USERNAME = "mai_lin"
# JSON manifest file under the repository's hub-meta/dev directory.
MANIFEST_PATH = ROOT_DIR / "hub-meta" / "dev" / "demo-seed-manifest.json"
# Resolution order: the optional ``cfg.app.internal_base_url`` attribute,
# then ``cfg.app.base_url``, then a hard-coded loopback default.
INTERNAL_BASE_URL = (
    getattr(cfg.app, "internal_base_url", None)
    or cfg.app.base_url
    or "http://127.0.0.1:48888"
)


class SeedError(RuntimeError):
    """Raised when demo data creation fails.

    For example, ``fetch_remote_asset`` raises it when a downloaded asset
    does not match its pinned SHA-256 digest.
    """


@dataclass(frozen=True)
class AccountSeed:
    """Immutable description of one demo user account."""

    username: str  # e.g. "mai_lin"
    email: str
    full_name: str
    bio: str
    website: str
    social_media: dict[str, str]  # platform key (e.g. "github") -> handle
    avatar_bg: str  # hex color string such as "#183153"
    avatar_accent: str  # hex color string such as "#f59e0b"


@dataclass(frozen=True)
class OrganizationSeed:
    """Immutable description of one demo organization and its members."""

    name: str  # e.g. "aurora-labs"
    description: str
    bio: str
    website: str
    social_media: dict[str, str]  # platform key (e.g. "github") -> handle
    avatar_bg: str  # hex color string
    avatar_accent: str  # hex color string
    # (username, role) pairs; roles seen in this file are "super-admin",
    # "admin", "member" and "visitor".
    members: tuple[tuple[str, str], ...]


@dataclass(frozen=True)
class CommitSeed:
    """Immutable description of one commit: its message and its files."""

    summary: str
    description: str
    # Each entry is a ``SeedFile``: a ``(path, bytes)`` pair or a ``FileSeed``.
    files: tuple["SeedFile", ...]


@dataclass(frozen=True)
class FileSeed:
    """A file whose content is given directly or produced on demand."""

    path: str
    # Raw bytes, or a zero-argument callable returning bytes; the callable is
    # invoked by ``materialize_seed_file``.
    content: bytes | Callable[[], bytes]


@dataclass(frozen=True)
class RepoSeed:
    """Immutable description of one demo repository and its commit history."""

    actor: str  # a username, e.g. "mai_lin"
    repo_type: str  # e.g. "model"
    namespace: str
    name: str
    private: bool
    commits: tuple[CommitSeed, ...]
    # NOTE(review): the optional fields below are consumed outside this
    # section of the file; presumably an extra branch/tag to create and a file
    # path plus session count for simulated downloads — confirm.
    branch: str | None = None
    tag: str | None = None
    download_path: str | None = None
    download_sessions: int = 0


# A file to commit: either an eager ``(path, content_bytes)`` pair or a
# ``FileSeed`` whose content may be produced lazily.
SeedFile = tuple[str, bytes] | FileSeed


@dataclass(frozen=True)
class RemoteAsset:
    """A remote file pinned by its SHA-256 digest."""

    # File name inside SEED_ASSET_CACHE_DIR; also the key of the in-memory
    # cache and of REMOTE_MEDIA_ASSETS.
    cache_name: str
    url: str  # URL that ``fetch_remote_asset`` downloads
    sha256: str  # expected lowercase hex digest of the file content
    # NOTE(review): not read in this section; presumably the page crediting
    # the asset's origin — confirm.
    source_url: str


# On-disk cache directory where ``fetch_remote_asset`` stores verified downloads.
SEED_ASSET_CACHE_DIR = ROOT_DIR / "hub-meta" / "cache" / "seed-assets"


# Hand-written core demo users. The first entry's username equals
# PRIMARY_USERNAME.
ACCOUNTS: tuple[AccountSeed, ...] = (
    AccountSeed(
        username="mai_lin",
        email="mai.lin@kohakuhub.dev",
        full_name="Mai Lin",
        bio=(
            "Product-minded ML engineer focused on reproducible dataset QA, "
            "small-model packaging, and local debugging workflows."
        ),
        website="https://kohakuhub.local/mai-lin",
        social_media={
            "github": "mai-lin-labs",
            "huggingface": "mai-lin-labs",
            "twitter_x": "mai_lin_ops",
        },
        avatar_bg="#183153",
        avatar_accent="#f59e0b",
    ),
    AccountSeed(
        username="leo_park",
        email="leo.park@kohakuhub.dev",
        full_name="Leo Park",
        bio=(
            "Frontend-heavy engineer who keeps repo demos honest with browser "
            "smoke tests and hand-curated example data."
        ),
        website="https://kohakuhub.local/leo-park",
        social_media={
            "github": "leo-park-dev",
            "threads": "leo.park.dev",
        },
        avatar_bg="#0f766e",
        avatar_accent="#f8fafc",
    ),
    AccountSeed(
        username="sara_chen",
        email="sara.chen@kohakuhub.dev",
        full_name="Sara Chen",
        bio=(
            "Annotation lead for invoice, receipt, and layout-heavy datasets. "
            "Prefers clean schemas over magical post-processing."
        ),
        website="https://kohakuhub.local/sara-chen",
        social_media={
            "github": "sara-chen-data",
            "huggingface": "sara-chen-data",
        },
        avatar_bg="#7c2d12",
        avatar_accent="#fde68a",
    ),
    AccountSeed(
        username="noah_kim",
        email="noah.kim@kohakuhub.dev",
        full_name="Noah Kim",
        bio=(
            "Ships compact vision models for harbor monitoring, segmentation, "
            "and camera-side smoke testing."
        ),
        website="https://kohakuhub.local/noah-kim",
        social_media={
            "github": "noah-kim-vision",
            "twitter_x": "noahkimvision",
        },
        avatar_bg="#1d4ed8",
        avatar_accent="#dbeafe",
    ),
    AccountSeed(
        username="ivy_ops",
        email="ivy.ops@kohakuhub.dev",
        full_name="Ivy Ops",
        bio=(
            "Release and infra support. Uses stable, boring fixtures so bug "
            "reports stay reproducible."
        ),
        website="https://kohakuhub.local/ivy-ops",
        social_media={
            "github": "ivy-ops",
        },
        avatar_bg="#3f3f46",
        avatar_accent="#f4f4f5",
    ),
)

# Hand-written core demo organizations. Every member username listed here
# appears in the hand-written ACCOUNTS tuple.
ORGANIZATIONS: tuple[OrganizationSeed, ...] = (
    OrganizationSeed(
        name="aurora-labs",
        description=(
            "Applied document intelligence team building OCR-friendly models, "
            "datasets, and lightweight internal tooling."
        ),
        bio=(
            "Aurora Labs curates multilingual OCR assets for receipts, forms, "
            "and customer-service automation."
        ),
        website="https://aurora-labs.kohakuhub.local",
        social_media={
            "github": "aurora-labs",
            "huggingface": "aurora-labs",
        },
        avatar_bg="#312e81",
        avatar_accent="#e0e7ff",
        members=(
            ("mai_lin", "super-admin"),
            ("leo_park", "admin"),
            ("sara_chen", "member"),
            ("ivy_ops", "visitor"),
        ),
    ),
    OrganizationSeed(
        name="harbor-vision",
        description=(
            "Small computer-vision team for coastal monitoring, dock safety, "
            "and camera-ready deployment checks."
        ),
        bio=(
            "Harbor Vision maintains compact segmentation and inspection models "
            "for edge-friendly marine operations."
        ),
        website="https://harbor-vision.kohakuhub.local",
        social_media={
            "github": "harbor-vision",
            "twitter_x": "harborvision",
        },
        avatar_bg="#0f766e",
        avatar_accent="#ccfbf1",
        members=(
            ("mai_lin", "super-admin"),
            ("noah_kim", "super-admin"),
            ("leo_park", "visitor"),
        ),
    ),
)


def build_scale_accounts() -> tuple[AccountSeed, ...]:
    """Return the additional demo accounts derived from a compact roster.

    The email and website of each account are derived from the username
    (underscores become dots and dashes respectively), and one handle is
    reused for both the GitHub and Hugging Face entries.
    """
    # Columns: username, full name, bio, social handle, avatar background,
    # avatar accent.
    roster = (
        (
            "mila_zhou",
            "Mila Zhou",
            "Dataset release engineer focused on parquet validation, shard manifests, and large org operations.",
            "mila-zhou-data",
            "#4c1d95",
            "#ede9fe",
        ),
        (
            "ethan_reed",
            "Ethan Reed",
            "Model packaging owner who keeps tokenizer assets, shard indexes, and release notes tidy.",
            "ethan-reed-models",
            "#0f766e",
            "#ccfbf1",
        ),
        (
            "olivia_hart",
            "Olivia Hart",
            "Benchmarks multimodal search pipelines and curates reproducible evaluation bundles.",
            "olivia-hart-ai",
            "#9a3412",
            "#ffedd5",
        ),
        (
            "liam_north",
            "Liam North",
            "Owns local demo QA for file-tree pagination, deep directory browsing, and download flows.",
            "liam-north-labs",
            "#1d4ed8",
            "#dbeafe",
        ),
        (
            "zoe_park",
            "Zoe Park",
            "Keeps audio, image, and video fixtures aligned with product demos and ingestion checks.",
            "zoe-park-media",
            "#065f46",
            "#d1fae5",
        ),
        (
            "owen_davis",
            "Owen Davis",
            "Maintains synthetic but structurally realistic model exports for offline smoke testing.",
            "owen-davis-ml",
            "#7c2d12",
            "#fed7aa",
        ),
        (
            "mia_cross",
            "Mia Cross",
            "Curates metadata-heavy datasets with stable labels and repeatable schema previews.",
            "mia-cross-data",
            "#be123c",
            "#ffe4e6",
        ),
        (
            "lucas_tan",
            "Lucas Tan",
            "Documents retrieval pipelines, indexed archives, and annotation workflows for the team.",
            "lucas-tan-docs",
            "#1e3a8a",
            "#dbeafe",
        ),
        (
            "ava_scott",
            "Ava Scott",
            "Runs browser-first QA against large org listings, search results, and activity views.",
            "ava-scott-qa",
            "#854d0e",
            "#fef3c7",
        ),
        (
            "jackson_liu",
            "Jackson Liu",
            "Tracks media indexing pipelines and long-tail file format regressions.",
            "jackson-liu-index",
            "#155e75",
            "#cffafe",
        ),
        (
            "grace_hill",
            "Grace Hill",
            "Handles org membership operations and permissions reviews for shared demo spaces.",
            "grace-hill-ops",
            "#6d28d9",
            "#ede9fe",
        ),
        (
            "henry_wu",
            "Henry Wu",
            "Maintains multilingual dataset snapshots and local release validation checklists.",
            "henry-wu-data",
            "#92400e",
            "#fef3c7",
        ),
    )

    accounts: list[AccountSeed] = []
    for handle, display_name, blurb, social_handle, bg_hex, accent_hex in roster:
        dotted = handle.replace("_", ".")
        dashed = handle.replace("_", "-")
        accounts.append(
            AccountSeed(
                username=handle,
                email=f"{dotted}@kohakuhub.dev",
                full_name=display_name,
                bio=blurb,
                website=f"https://kohakuhub.local/{dashed}",
                social_media={
                    "github": social_handle,
                    "huggingface": social_handle,
                },
                avatar_bg=bg_hex,
                avatar_accent=accent_hex,
            )
        )
    return tuple(accounts)


# Generated accounts are appended after the hand-written ones, so ACCOUNTS
# now holds both groups in that order.
SCALE_ACCOUNTS = build_scale_accounts()
ACCOUNTS = ACCOUNTS + SCALE_ACCOUNTS


# (username, role) pairs for the "open-media-lab" organization: the five
# hand-written accounts plus all twelve accounts from build_scale_accounts().
OPEN_MEDIA_MEMBERS: tuple[tuple[str, str], ...] = (
    ("mai_lin", "super-admin"),
    ("leo_park", "admin"),
    ("sara_chen", "admin"),
    ("ivy_ops", "admin"),
    ("noah_kim", "member"),
    ("mila_zhou", "admin"),
    ("ethan_reed", "member"),
    ("olivia_hart", "member"),
    ("liam_north", "member"),
    ("zoe_park", "member"),
    ("owen_davis", "member"),
    ("mia_cross", "member"),
    ("lucas_tan", "member"),
    ("ava_scott", "visitor"),
    ("jackson_liu", "member"),
    ("grace_hill", "visitor"),
    ("henry_wu", "member"),
)

# Append the large, many-member organization after the hand-written ones.
ORGANIZATIONS = ORGANIZATIONS + (
    OrganizationSeed(
        name="open-media-lab",
        description=(
            "Shared local-dev org packed with multimodal fixtures, large repo lists, "
            "and high-member-count collaboration scenarios."
        ),
        bio=(
            "Open Media Lab maintains reproducible multimodal assets for UI browsing, "
            "download tracking, metadata QA, and repository management demos."
        ),
        website="https://open-media-lab.kohakuhub.local",
        social_media={
            "github": "open-media-lab",
            "huggingface": "open-media-lab",
        },
        avatar_bg="#0f172a",
        avatar_accent="#bae6fd",
        members=OPEN_MEDIA_MEMBERS,
    ),
)


# Ten image fixtures served from cdn.donmai.us. Each entry pins the expected
# SHA-256 of the file, which ``fetch_remote_asset`` verifies before use, and
# records the safebooru.donmai.us post it came from as ``source_url``.
SAFEBOORU_IMAGE_ASSETS: tuple[RemoteAsset, ...] = (
    RemoteAsset(
        cache_name="safebooru-canal-reflections.png",
        url="https://cdn.donmai.us/original/79/a6/79a6c565714b36c5689131085d70a8a2.png",
        sha256="4b0b07d9f6d2658346525326567f4db7aebeae8b2ade4facb0f56f9972bdb669",
        source_url="https://safebooru.donmai.us/posts/11208212",
    ),
    RemoteAsset(
        cache_name="safebooru-mountain-church.jpg",
        url="https://cdn.donmai.us/original/dc/d4/dcd4a809e6efc402363720a6714bc4f7.jpg",
        sha256="a688df893449c757d979ff877aa1a3f006de649686ed0f5b101e807808e1dbc7",
        source_url="https://safebooru.donmai.us/posts/11207803",
    ),
    RemoteAsset(
        cache_name="safebooru-sand-plain.jpg",
        url="https://cdn.donmai.us/original/e8/20/e8201ebfcf9802fd5b74f126ae501406.jpg",
        sha256="14420b7849ab8922914d2ccc5d32abbf25ae26642ea50dfbb15096a8d9e85503",
        source_url="https://safebooru.donmai.us/posts/11207788",
    ),
    RemoteAsset(
        cache_name="safebooru-fence-field.jpg",
        url="https://cdn.donmai.us/original/5d/28/5d2833c4731c2b8631eefe5f89cd2541.jpg",
        sha256="e7eec10df1393ee661da300612b84cc4b0f8052d54aae4244cddaaaeb50a3d79",
        source_url="https://safebooru.donmai.us/posts/11207775",
    ),
    RemoteAsset(
        cache_name="safebooru-forest-lake.jpg",
        url="https://cdn.donmai.us/original/08/33/08330cb79116cd7dd1000f702b28c4f3.jpg",
        sha256="565520f058666a04953a1cbc8db67b2687fde240bb26b29d9b1008f562d78aa6",
        source_url="https://safebooru.donmai.us/posts/11207641",
    ),
    RemoteAsset(
        cache_name="safebooru-fantasy-castle.jpg",
        url="https://cdn.donmai.us/original/31/45/3145abe70177f3d01150a8fa9aa692dc.jpg",
        sha256="1d52643e22021364650176ff5c47e70ee101020f3329f9cd1f44b9aad739737a",
        source_url="https://safebooru.donmai.us/posts/11207593",
    ),
    RemoteAsset(
        cache_name="safebooru-phainon-cyrene.jpg",
        url=(
            "https://cdn.donmai.us/original/29/82/"
            "__phainon_and_cyrene_honkai_and_1_more_drawn_by_whyte_srsn__"
            "298282d12b00b563a09bebb65cc11116.jpg"
        ),
        sha256="8c8e04d47dea6ba020c6f0ec96932aaf760101b1cd358ba6eb829aa908f52b2f",
        source_url="https://safebooru.donmai.us/posts/9740876",
    ),
    RemoteAsset(
        cache_name="safebooru-sunflower-field.png",
        url=(
            "https://cdn.donmai.us/original/65/dd/"
            "__shirakami_fubuki_hololive_drawn_by_hyde_tabakko__"
            "65ddfa390ca539e6f9ed9658d65c77c4.png"
        ),
        sha256="c6a157e11758d8b1584502f772f1300c2a0b9e00ba7d9d883fd6b24b247181c0",
        source_url="https://safebooru.donmai.us/posts/9779697",
    ),
    RemoteAsset(
        cache_name="safebooru-grass-wonder.jpg",
        url=(
            "https://cdn.donmai.us/original/f9/5f/"
            "__grass_wonder_umamusume_and_1_more_drawn_by_fuuseppu__"
            "f95f1c3cdc9e69d9f2de613dc8117df2.jpg"
        ),
        sha256="35d08757090287d2fa465cc7ab959829b3df03c18e254580fc6ecbb8dc1cb118",
        source_url="https://safebooru.donmai.us/posts/9658576",
    ),
    RemoteAsset(
        cache_name="safebooru-paper-boat.jpg",
        url=(
            "https://cdn.donmai.us/original/f2/66/"
            "__sameko_saba_indie_virtual_youtuber_drawn_by_sky_above_me__"
            "f2664dc9d6a90473cf49234a3f30bea1.jpg"
        ),
        sha256="ae20506f36504895708fe1c85979c1dede228571044457bd5e91daaa1415ce7e",
        source_url="https://safebooru.donmai.us/posts/9599213",
    ),
)

# Lookup table of every remote asset, keyed by ``cache_name``: the image
# fixtures above plus two WAV files, one AVI video, one safetensors file and
# one parquet file. ``remote_asset_bytes`` resolves names through this dict.
REMOTE_MEDIA_ASSETS: dict[str, RemoteAsset] = {
    asset.cache_name: asset
    for asset in (
        *SAFEBOORU_IMAGE_ASSETS,
        RemoteAsset(
            cache_name="voices-speech.wav",
            url=(
                "https://download.pytorch.org/torchaudio/tutorial-assets/"
                "Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
            ),
            sha256="c65fcd726d6b08c82c1e5dc7558f863cd8d483e3ed2f4a7bcf271dc1865ada14",
            source_url=(
                "https://download.pytorch.org/torchaudio/tutorial-assets/"
                "Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
            ),
        ),
        RemoteAsset(
            cache_name="steam-train-whistle.wav",
            url=(
                "https://download.pytorch.org/torchaudio/tutorial-assets/"
                "steam-train-whistle-daniel_simon.wav"
            ),
            sha256="762b6783be7f20aa8be03812eeb33184bb5b1497db7422607a70b5d441fc45e9",
            source_url=(
                "https://download.pytorch.org/torchaudio/tutorial-assets/"
                "steam-train-whistle-daniel_simon.wav"
            ),
        ),
        RemoteAsset(
            cache_name="opencv-vtest.avi",
            url="https://raw.githubusercontent.com/opencv/opencv/4.x/samples/data/vtest.avi",
            sha256="45cddc9490be69345cbdab64ca583be65987e864ca408038e648db99e10516cf",
            source_url="https://github.com/opencv/opencv/blob/4.x/samples/data/vtest.avi",
        ),
        # Real HF-hosted fixtures used to exercise the pure-client preview
        # path (issue #27). Both files are small (~500 KB each), pinned by
        # sha256, and sourced from long-stable public HF test artifacts so
        # the seed stays deterministic across runs.
        RemoteAsset(
            cache_name="hf-tiny-random-bert.safetensors",
            url="https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/model.safetensors",
            sha256="965f02b6a7e5520fc12f710e4e3b6132f697f1c8f648819553c5ade86752d2de",
            source_url="https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/model.safetensors",
        ),
        RemoteAsset(
            cache_name="hf-no-robots-test.parquet",
            url="https://huggingface.co/datasets/HuggingFaceH4/no_robots/resolve/main/data/test-00000-of-00001.parquet",
            sha256="60707b2636a46e37bb0c1e9ca263a18553f430317b7a53c691676d6a492fc0f2",
            source_url="https://huggingface.co/datasets/HuggingFaceH4/no_robots/blob/main/data/test-00000-of-00001.parquet",
        ),
    )
}


def text_bytes(body: str) -> bytes:
    """Dedent *body*, trim surrounding whitespace, and encode it as UTF-8.

    The result always ends with exactly one trailing newline, which lets
    callers write indented triple-quoted literals inline.
    """
    normalized = textwrap.dedent(body).strip()
    return f"{normalized}\n".encode("utf-8")


def json_bytes(payload: dict | list) -> bytes:
    """Render *payload* as pretty-printed JSON bytes.

    Keys are sorted and the indent is two spaces so that the output is
    stable across runs; a trailing newline is appended.
    """
    rendered = json.dumps(payload, sort_keys=True, indent=2)
    return (rendered + "\n").encode("utf-8")


def csv_bytes(rows: Iterable[Iterable[str]]) -> bytes:
    """Serialize *rows* as UTF-8 encoded CSV with LF line endings.

    Fields are written with the standard ``csv`` writer, so values that
    contain commas, double quotes, or line breaks are quoted and escaped
    instead of corrupting the column layout. Rows made only of plain fields
    produce the same bytes as a bare ``",".join`` would.

    Args:
        rows: Iterable of rows, each an iterable of field values.

    Returns:
        The encoded CSV document. An empty *rows* yields a single newline.
    """
    import csv  # local import: only this helper needs the csv module

    buffer = io.StringIO()
    csv.writer(buffer, lineterminator="\n").writerows(rows)
    # Keep the historical output for an empty input: one bare newline.
    return (buffer.getvalue() or "\n").encode("utf-8")


def jsonl_bytes(rows: Iterable[dict]) -> bytes:
    """Encode *rows* as JSON Lines: one key-sorted JSON object per line.

    The output is UTF-8 and always ends with a newline, including when
    *rows* is empty.
    """
    serialized = [json.dumps(record, sort_keys=True) for record in rows]
    document = "\n".join(serialized) + "\n"
    return document.encode("utf-8")


def profile_space_files(title: str, summary: str, accent: str) -> tuple[tuple[str, bytes], ...]:
    """Build the files of a minimal Gradio demo space.

    Args:
        title: Space title, used in the README front matter/heading and in
            the generated ``app.py``.
        summary: Short description, used in the README body and as the
            Gradio interface description.
        accent: Value passed as ``primary_hue`` to the Gradio theme.

    Returns:
        ``(path, content)`` pairs for ``README.md``, ``app.py`` and
        ``requirements.txt``.
    """
    # Values embedded in the generated Python source are rendered as string
    # literals via json.dumps: a JSON string is also a valid double-quoted
    # Python literal, so quotes, backslashes and newlines in the inputs no
    # longer produce a syntactically broken app.py. Plain text renders exactly
    # as it did when it was pasted between hard-coded quotes.
    title_literal = json.dumps(title, ensure_ascii=False)
    prefix_literal = json.dumps(f"{title}: ", ensure_ascii=False)
    summary_literal = json.dumps(summary, ensure_ascii=False)
    accent_literal = json.dumps(accent, ensure_ascii=False)

    return (
        (
            "README.md",
            text_bytes(
                f"""
                ---
                title: {title}
                emoji: "\u2605"
                colorFrom: indigo
                colorTo: amber
                sdk: gradio
                sdk_version: "4.44.0"
                ---

                # {title}

                {summary}

                This space exists so local profile pages render with realistic content
                instead of an empty placeholder repository.
                """
            ),
        ),
        (
            "app.py",
            text_bytes(
                f"""
                import gradio as gr

                demo = gr.Interface(
                    fn=lambda text: {prefix_literal} + text.strip(),
                    inputs=gr.Textbox(label="Prompt"),
                    outputs=gr.Textbox(label="Response"),
                    title={title_literal},
                    description={summary_literal},
                    theme=gr.themes.Soft(primary_hue={accent_literal}),
                )

                if __name__ == "__main__":
                    demo.launch()
                """
            ),
        ),
        ("requirements.txt", text_bytes("gradio>=4.44.0")),
    )


def seed_file(path: str, content: bytes | Callable[[], bytes]) -> FileSeed:
    """Shorthand constructor for a ``FileSeed`` with eager or lazy content."""
    return FileSeed(path, content)


def materialize_seed_file(file_entry: SeedFile) -> tuple[str, bytes]:
    """Resolve a seed file entry into a concrete ``(path, bytes)`` pair.

    Plain tuples are returned unchanged. For a ``FileSeed`` the content is
    used directly, or called first when it is a factory callable.
    """
    if not isinstance(file_entry, FileSeed):
        return file_entry

    payload = file_entry.content
    if callable(payload):
        payload = payload()
    return file_entry.path, payload


# In-process cache of verified asset content, keyed by ``RemoteAsset.cache_name``;
# filled and read by ``fetch_remote_asset``.
_ASSET_BYTES_CACHE: dict[str, bytes] = {}


def patterned_bytes(label: str, size_bytes: int, *, header: bytes = b"") -> bytes:
    """Return exactly *size_bytes* of deterministic filler derived from *label*.

    The output starts with *header* (truncated if it is longer than the
    requested size) followed by a repeating 4096-byte pattern built from the
    SHA-256 digests of ``"<label>:0"``, ``"<label>:1"``, and so on.

    Args:
        label: Seed text; different labels give different patterns.
        size_bytes: Total length of the result, header included.
        header: Optional literal prefix.

    Returns:
        A ``bytes`` object of length *size_bytes*.

    Raises:
        ValueError: If *size_bytes* is negative. Previously a negative size
            silently sliced the header from its end instead of failing.
    """
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes}")
    if size_bytes <= len(header):
        return header[:size_bytes]

    # 128 digests of 32 bytes each make up one 4096-byte pattern block.
    pattern = b"".join(
        hashlib.sha256(f"{label}:{counter}".encode("utf-8")).digest()
        for counter in range(4096 // hashlib.sha256().digest_size)
    )

    body_size = size_bytes - len(header)
    repeats = math.ceil(body_size / len(pattern))
    return header + (pattern * repeats)[:body_size]


def sha256_hex(data: bytes) -> str:
    """Return the lowercase hexadecimal SHA-256 digest of *data*."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()


def fetch_remote_asset(asset: RemoteAsset) -> bytes:
    """Return the verified content of *asset*, downloading it if needed.

    Lookup order: the in-process cache, then the on-disk cache (used only
    when its SHA-256 matches the pinned digest), then an HTTP download. A
    fresh download is written to a ``.part`` file and renamed into place,
    then remembered in memory.

    Raises:
        SeedError: If the downloaded content does not match ``asset.sha256``.
        requests.HTTPError: If the server answers with an error status.
    """
    name = asset.cache_name
    if (in_memory := _ASSET_BYTES_CACHE.get(name)) is not None:
        return in_memory

    disk_copy = SEED_ASSET_CACHE_DIR / name
    if disk_copy.is_file():
        on_disk = disk_copy.read_bytes()
        # A stale or corrupted cache file falls through to a fresh download.
        if sha256_hex(on_disk) == asset.sha256:
            _ASSET_BYTES_CACHE[name] = on_disk
            return on_disk

    disk_copy.parent.mkdir(parents=True, exist_ok=True)
    reply = requests.get(
        asset.url,
        timeout=180,
        headers={"User-Agent": "KohakuHubLocalSeed/1.0"},
    )
    reply.raise_for_status()
    downloaded = reply.content

    digest = sha256_hex(downloaded)
    if digest != asset.sha256:
        raise SeedError(
            f"Remote asset hash mismatch for {asset.cache_name}: "
            f"expected {asset.sha256}, got {digest}"
        )

    # Write next to the target and rename, so the cache file is only ever
    # replaced by fully written content.
    partial = disk_copy.with_suffix(f"{disk_copy.suffix}.part")
    partial.write_bytes(downloaded)
    partial.replace(disk_copy)

    _ASSET_BYTES_CACHE[name] = downloaded
    return downloaded


def remote_asset_bytes(asset_name: str) -> bytes:
    """Fetch the asset registered under *asset_name* in REMOTE_MEDIA_ASSETS.

    Raises:
        KeyError: If no asset with that cache name is registered.
    """
    asset = REMOTE_MEDIA_ASSETS[asset_name]
    return fetch_remote_asset(asset)


def make_realistic_float16_tensor(label: str, shape: tuple[int, ...]) -> np.ndarray:
    """Build a deterministic float16 array of *shape* seeded by *label*.

    Two patterned bytes are consumed per element. Each 16-bit word keeps
    only its low 10 bits and is OR-ed with 0x3C00 (the bit pattern of
    float16 1.0), which clears the sign bit and fixes the exponent field.
    """
    n_elements = math.prod(shape)
    seed_bytes = patterned_bytes(label, n_elements * 2)
    words = np.frombuffer(seed_bytes, dtype="<u2").copy()

    fraction_bits = words & np.uint16(0x03FF)
    words = fraction_bits | np.uint16(0x3C00)

    halves = words.view(np.float16).reshape(shape)
    return np.ascontiguousarray(halves)


def make_safetensors_bytes(
    label: str,
    tensor_specs: tuple[tuple[str, tuple[int, ...]], ...],
    *,
    metadata: dict[str, str] | None = None,
) -> tuple[bytes, int]:
    """Serialize deterministic float16 tensors into a safetensors payload.

    Args:
        label: Seed label; each tensor is generated from ``"<label>:<name>"``.
        tensor_specs: ``(tensor_name, shape)`` pairs.
        metadata: Extra header entries; these override the default
            ``format`` and ``seed_label`` keys on collision.

    Returns:
        ``(payload, total_tensor_bytes)`` where the second item sums the
        ``nbytes`` of every generated tensor.
    """
    generated = [
        (tensor_name, make_realistic_float16_tensor(f"{label}:{tensor_name}", shape))
        for tensor_name, shape in tensor_specs
    ]
    tensors: dict[str, np.ndarray] = dict(generated)
    # Summed over the generated list, so a repeated name is still counted.
    total_tensor_bytes = sum(array.nbytes for _, array in generated)

    header = {"format": "pt", "seed_label": label}
    header.update(metadata or {})

    payload = save_safetensors(tensors, metadata=header)
    return payload, total_tensor_bytes


def make_single_checkpoint_bytes(
    label: str,
    tensor_specs: tuple[tuple[str, tuple[int, ...]], ...],
) -> bytes:
    """Return only the serialized payload from ``make_safetensors_bytes``."""
    return make_safetensors_bytes(label, tensor_specs)[0]


def make_parquet_bytes(
    label: str,
    *,
    row_count: int = 12000,
    payload_size: int = 2048,
) -> bytes:
    """Build a deterministic, uncompressed Parquet file in memory.

    The table has four columns: ``sample_id`` (string), ``caption`` (string),
    ``duration_seconds`` (float32) and ``payload`` (binary). Each payload is a
    per-row prefix ``"<label>:<index>|"`` followed by patterned filler, for
    ``payload_size`` bytes in total.

    Args:
        label: Name embedded in every id, caption and payload prefix.
        row_count: Number of rows to generate.
        payload_size: Target byte length of each ``payload`` value. If it is
            shorter than the row prefix, the payload is the prefix alone.

    Returns:
        The serialized Parquet file.
    """
    base_payload = patterned_bytes(f"{label}-payload", payload_size)
    payloads = []
    sample_ids = []
    captions = []
    durations = []
    for row_index in range(row_count):
        prefix = f"{label}:{row_index:05d}|".encode("utf-8")
        # Clamp at zero: a negative slice bound would trim the filler from
        # its end and give payloads of unpredictable length.
        filler_size = max(payload_size - len(prefix), 0)
        payloads.append(prefix + base_payload[:filler_size])
        sample_ids.append(f"{label}_{row_index:05d}")
        captions.append(
            f"{label} multimodal benchmark row {row_index:05d} for local dataset preview checks."
        )
        # Durations cycle through 1.5, 1.75, ..., 4.0 with a period of 11 rows.
        durations.append(round(1.5 + (row_index % 11) * 0.25, 3))

    table = pa.table(
        {
            "sample_id": pa.array(sample_ids, type=pa.string()),
            "caption": pa.array(captions, type=pa.string()),
            "duration_seconds": pa.array(durations, type=pa.float32()),
            "payload": pa.array(payloads, type=pa.binary()),
        }
    )

    # Compression and dictionary encoding are both switched off for this file.
    buffer = io.BytesIO()
    pq.write_table(
        table,
        buffer,
        compression="NONE",
        use_dictionary=False,
        row_group_size=512,
    )
    return buffer.getvalue()


def make_indexed_tar_bundle(
    label: str,
    files: tuple[tuple[str, bytes], ...],
) -> tuple[bytes, bytes]:
    """Pack *files* into a tar archive and build its hfutils index.

    Every member gets fixed metadata (mode 0o644, mtime 0, uid/gid 0, empty
    owner names) so the archive bytes do not vary between runs.

    Returns:
        ``(tar_bytes, index_bytes)``, the second being the JSON-encoded
        index produced by ``hf_index.tar_get_index_info``.
    """
    archive = io.BytesIO()
    with tarfile.open(fileobj=archive, mode="w") as tar:
        for member_path, member_bytes in files:
            member = tarfile.TarInfo(name=member_path)
            member.size = len(member_bytes)
            member.mode = 0o644
            member.mtime = 0
            member.uid = member.gid = 0
            member.uname = member.gname = ""
            tar.addfile(member, io.BytesIO(member_bytes))
    tar_bytes = archive.getvalue()

    # The indexer is given a filesystem path, so stage the archive in a
    # throwaway directory.
    with tempfile.TemporaryDirectory(prefix="kohakuhub-seed-tar-") as scratch_dir:
        staged_tar = Path(scratch_dir) / f"{label}.tar"
        staged_tar.write_bytes(tar_bytes)
        index_info = hf_index.tar_get_index_info(str(staged_tar), silent=True)

    return tar_bytes, json_bytes(index_info)


def make_deep_tree_files(label: str) -> tuple[SeedFile, ...]:
    """Generate a deeply nested file tree for browsing and pagination demos.

    Produces 6 x 8 x 6 = 288 small JSON entries under ``catalog/``, followed
    by a ``README.md`` and a ``manifests/root-index.json`` that records how
    many entries were generated.
    """

    def _catalog_entry(section: int, shard: int, leaf: int) -> tuple[str, bytes]:
        # The same section-shard-leaf stem names both the node directory and
        # the entry file.
        stem = f"{section:02d}-{shard:02d}-{leaf:02d}"
        path = (
            f"catalog/section-{section:02d}/tier-{shard:02d}/"
            f"branch-{leaf:02d}/node-{stem}/"
            f"entry-{stem}.json"
        )
        document = {
            "checksum": hashlib.sha256(path.encode("utf-8")).hexdigest(),
            "fixture": label,
            "leaf": leaf,
            "section": section,
            "shard": shard,
        }
        return path, json_bytes(document)

    catalog: list[SeedFile] = [
        _catalog_entry(section, shard, leaf)
        for section in range(1, 7)
        for shard in range(1, 9)
        for leaf in range(1, 7)
    ]

    readme = (
        "README.md",
        text_bytes(
            """
            # hierarchy-crawl-fixtures

            This repo intentionally contains many files and deep path nesting so
            local tree browsing, pagination, and search remain easy to exercise.
            """
        ),
    )
    root_index = (
        "manifests/root-index.json",
        json_bytes(
            {
                "depth": 4,
                "generated_files": len(catalog),
                "label": label,
            }
        ),
    )
    return (*catalog, readme, root_index)


def build_repo_seeds() -> tuple[RepoSeed, ...]:
    return (
        RepoSeed(
            actor="mai_lin",
            repo_type="model",
            namespace="mai_lin",
            name="lineart-caption-base",
            private=False,
            commits=(
                CommitSeed(
                    summary="Bootstrap base caption model",
                    description=(
                        "Create the public demo model repo with a realistic README, "
                        "lightweight config, and a small LFS-tracked checkpoint."
                    ),
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: mit
                                library_name: transformers
                                pipeline_tag: image-to-text
                                tags:
                                  - captioning
                                  - line-art
                                  - document-vision
                                ---

                                # lineart-caption-base

                                A compact caption model tuned for monochrome line art,
                                icon-heavy diagrams, and OCR-adjacent illustrations.

                                ## Intended use

                                - draft captions for internal QA dashboards
                                - generate quick prompts for reviewers
                                - validate frontend metadata rendering
                                """
                            ),
                        ),
                        (
                            "config.json",
                            json_bytes(
                                {
                                    "architectures": ["VisionEncoderDecoderModel"],
                                    "decoder_layers": 6,
                                    "encoder_layers": 12,
                                    "image_size": 448,
                                    "model_type": "lineart-caption-base",
                                    "vocab_size": 32000,
                                }
                            ),
                        ),
                        (
                            "tokenizer.json",
                            json_bytes(
                                {
                                    "added_tokens": [],
                                    "normalizer": {"type": "NFKC"},
                                    "pre_tokenizer": {"type": "Whitespace"},
                                    "version": "1.0",
                                }
                            ),
                        ),
                        ("examples/prompt.txt", text_bytes("Describe the icon, layout, and visible text.")),
                        seed_file(
                            "checkpoints/lineart-caption-base.safetensors",
                            lambda: make_single_checkpoint_bytes(
                                "lineart-caption-base",
                                (
                                    (
                                        "encoder.vision_model.embeddings.patch_embedding.weight",
                                        (4096, 1024),
                                    ),
                                    ("decoder.model.embed_tokens.weight", (1024, 768)),
                                ),
                            ),
                        ),
                    ),
                ),
                CommitSeed(
                    summary="Add eval notes and release metrics",
                    description="Follow-up commit so commit history and file updates are visible in local UI.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: mit
                                library_name: transformers
                                pipeline_tag: image-to-text
                                tags:
                                  - captioning
                                  - line-art
                                  - document-vision
                                ---

                                # lineart-caption-base

                                A compact caption model tuned for monochrome line art,
                                icon-heavy diagrams, and OCR-adjacent illustrations.

                                ## Current release

                                - validation CIDEr: 1.38
                                - latency target: <120 ms on local A10G
                                - known gap: dense legends still need manual review
                                """
                            ),
                        ),
                        (
                            "eval/metrics.json",
                            json_bytes(
                                {
                                    "cider": 1.38,
                                    "clip_score": 0.284,
                                    "latency_ms_p50": 87,
                                    "latency_ms_p95": 114,
                                }
                            ),
                        ),
                        (
                            "docs/training-notes.md",
                            text_bytes(
                                """
                                # Training Notes

                                - Base corpus: 82k internal line-art render pairs
                                - Additional hard negatives: 4k cluttered signage crops
                                - Checkpoint exported for small-batch browser smoke tests
                                """
                            ),
                        ),
                    ),
                ),
            ),
            branch="ablation-notes",
            tag="v0.2.1",
            download_path="checkpoints/lineart-caption-base.safetensors",
            download_sessions=4,
        ),
        RepoSeed(
            actor="mai_lin",
            repo_type="dataset",
            namespace="mai_lin",
            name="street-sign-zh-en",
            private=False,
            commits=(
                CommitSeed(
                    summary="Import bilingual street sign dataset",
                    description="Seed a CSV-backed dataset that exercises dataset preview and tree views.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: cc-by-4.0
                                task_categories:
                                  - image-text-to-text
                                language:
                                  - zh
                                  - en
                                pretty_name: Street Sign ZH EN
                                ---

                                # street-sign-zh-en

                                A small bilingual dataset for OCR-friendly sign translation and
                                layout QA. Rows keep the original text, translation, and scene tag.
                                """
                            ),
                        ),
                        (
                            "data/train.csv",
                            csv_bytes(
                                (
                                    ("image", "text_zh", "text_en", "scene"),
                                    ("img_0001.png", "\u5317\u4eac\u7ad9", "Beijing Railway Station", "station"),
                                    ("img_0002.png", "\u5c0f\u5fc3\u53f0\u9636", "Watch Your Step", "retail"),
                                    ("img_0003.png", "\u7981\u6b62\u5438\u70df", "No Smoking", "hospital"),
                                    ("img_0004.png", "\u53f3\u8f6c\u8f66\u9053", "Right Turn Only", "road"),
                                )
                            ),
                        ),
                        (
                            "data/validation.csv",
                            csv_bytes(
                                (
                                    ("image", "text_zh", "text_en", "scene"),
                                    ("val_0001.png", "\u51fa\u53e3", "Exit", "mall"),
                                    ("val_0002.png", "\u670d\u52a1\u53f0", "Service Desk", "airport"),
                                )
                            ),
                        ),
                        (
                            "metadata/features.json",
                            json_bytes(
                                {
                                    "image": "string",
                                    "text_zh": "string",
                                    "text_en": "string",
                                    "scene": "string",
                                }
                            ),
                        ),
                    ),
                ),
                CommitSeed(
                    summary="Add preview samples for dataset viewer",
                    description="Include JSONL samples and notebook notes for local bug reproduction.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: cc-by-4.0
                                task_categories:
                                  - image-text-to-text
                                language:
                                  - zh
                                  - en
                                pretty_name: Street Sign ZH EN
                                ---

                                # street-sign-zh-en

                                A small bilingual dataset for OCR-friendly sign translation and
                                layout QA. Rows keep the original text, translation, and scene tag.

                                ## Notes

                                Validation rows intentionally mix transport, retail, and public
                                service scenarios so sorting and filtering bugs are easier to spot.
                                """
                            ),
                        ),
                        (
                            "previews/samples.jsonl",
                            jsonl_bytes(
                                (
                                    {
                                        "image": "img_0001.png",
                                        "text_zh": "\u5317\u4eac\u7ad9",
                                        "text_en": "Beijing Railway Station",
                                        "scene": "station",
                                    },
                                    {
                                        "image": "img_0002.png",
                                        "text_zh": "\u5c0f\u5fc3\u53f0\u9636",
                                        "text_en": "Watch Your Step",
                                        "scene": "retail",
                                    },
                                )
                            ),
                        ),
                        (
                            "notebooks/README.md",
                            text_bytes(
                                """
                                # Notebook Notes

                                This dataset is intentionally tiny in local dev. The point is to
                                exercise preview, pagination, and schema rendering without waiting
                                on a large bootstrap import.
                                """
                            ),
                        ),
                    ),
                ),
            ),
            branch="qa-pass",
            tag="2026-04-demo",
            download_path="data/train.csv",
            download_sessions=8,
        ),
        RepoSeed(
            actor="mai_lin",
            repo_type="space",
            namespace="mai_lin",
            name="mai_lin",
            private=False,
            commits=(
                CommitSeed(
                    summary="Create profile showcase space",
                    description="Provide a same-name space so local profile pages render a realistic card.",
                    files=profile_space_files(
                        "Mai Lin Workspace",
                        "Small utilities and pinned demos used for local reproduction.",
                        "amber",
                    ),
                ),
                CommitSeed(
                    summary="Add profile theme preset",
                    description="A second commit makes the space history non-empty for UI testing.",
                    files=(
                        (
                            "assets/theme.json",
                            json_bytes(
                                {
                                    "accent": "amber",
                                    "layout": "split",
                                    "panels": ["repos", "activity", "notes"],
                                }
                            ),
                        ),
                    ),
                ),
            ),
        ),
        RepoSeed(
            actor="mai_lin",
            repo_type="dataset",
            namespace="mai_lin",
            name="internal-evals",
            private=True,
            commits=(
                CommitSeed(
                    summary="Seed private eval artifacts",
                    description="Keep one private user-owned repo for auth and permission checks.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                # internal-evals

                                Private staging area for eval summaries and failure-case review.
                                This repo is intentionally private and only accessible to Mai.
                                """
                            ),
                        ),
                        (
                            "runs/2026-04-15-summary.json",
                            json_bytes(
                                {
                                    "caption_regressions": 7,
                                    "dataset": "street-sign-zh-en",
                                    "notes": "False positives cluster around mirrored storefront text.",
                                }
                            ),
                        ),
                        (
                            "data/failure_cases.jsonl",
                            jsonl_bytes(
                                (
                                    {
                                        "file": "eval_001.png",
                                        "issue": "mirror_text",
                                        "severity": "medium",
                                    },
                                    {
                                        "file": "eval_002.png",
                                        "issue": "crowded_legend",
                                        "severity": "high",
                                    },
                                )
                            ),
                        ),
                    ),
                ),
                CommitSeed(
                    summary="Add reviewer checklist",
                    description="Second commit for commit-history coverage on a private repo.",
                    files=(
                        (
                            "notes/reviewer-checklist.md",
                            text_bytes(
                                """
                                # Reviewer Checklist

                                - confirm sample renders in dataset viewer
                                - compare translated text against bilingual CSV rows
                                - log UI regressions with the seeded repo name
                                """
                            ),
                        ),
                    ),
                ),
            ),
            download_path="runs/2026-04-15-summary.json",
            download_sessions=1,
        ),
        RepoSeed(
            actor="mai_lin",
            repo_type="space",
            namespace="aurora-labs",
            name="aurora-labs",
            private=False,
            commits=(
                CommitSeed(
                    summary="Create org showcase space",
                    description="Same-name org space keeps organization profile pages representative.",
                    files=profile_space_files(
                        "Aurora Labs Demo Portal",
                        "Landing page for OCR demos, pinned datasets, and release notes.",
                        "indigo",
                    ),
                ),
                CommitSeed(
                    summary="Add roadmap note",
                    description="A lightweight follow-up commit for org space history.",
                    files=(
                        (
                            "docs/roadmap.md",
                            text_bytes(
                                """
                                # Local Demo Roadmap

                                - tighten OCR-lite benchmark reporting
                                - keep receipt-layout-bench labels stable for bug repro
                                - mirror one private support model for permission testing
                                """
                            ),
                        ),
                    ),
                ),
            ),
        ),
        RepoSeed(
            actor="mai_lin",
            repo_type="model",
            namespace="aurora-labs",
            name="aurora-ocr-lite",
            private=False,
            commits=(
                CommitSeed(
                    summary="Publish OCR-lite baseline",
                    description="Public model repo with LFS checkpoint and readable metadata.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: apache-2.0
                                library_name: transformers
                                pipeline_tag: image-to-text
                                tags:
                                  - ocr
                                  - receipts
                                  - multilingual
                                ---

                                # aurora-ocr-lite

                                An OCR-focused checkpoint for receipt snippets, payment slips,
                                and service counter paperwork.
                                """
                            ),
                        ),
                        (
                            "config.json",
                            json_bytes(
                                {
                                    "backbone": "vit-small-patch16-384",
                                    "decoder": "bart-base",
                                    "max_position_embeddings": 512,
                                    "torch_dtype": "float16",
                                }
                            ),
                        ),
                        (
                            "vocab.txt",
                            text_bytes(
                                """
                                [PAD]
                                [UNK]
                                total
                                subtotal
                                tax
                                cashier
                                paid
                                """
                            ),
                        ),
                        seed_file(
                            "checkpoints/aurora-ocr-lite.safetensors",
                            lambda: make_single_checkpoint_bytes(
                                "aurora-ocr-lite",
                                (
                                    ("encoder.patch_embed.proj.weight", (6144, 1024)),
                                    ("decoder.model.embed_tokens.weight", (2048, 1024)),
                                ),
                            ),
                        ),
                    ),
                ),
                CommitSeed(
                    summary="Add benchmark export and release notes",
                    description="Keep one public org model slightly more active for trending and history views.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: apache-2.0
                                library_name: transformers
                                pipeline_tag: image-to-text
                                tags:
                                  - ocr
                                  - receipts
                                  - multilingual
                                ---

                                # aurora-ocr-lite

                                An OCR-focused checkpoint for receipt snippets, payment slips,
                                and service counter paperwork.

                                ## Release notes

                                - reduced hallucinated currency markers on narrow receipt crops
                                - added benchmark export used by the admin dashboard smoke tests
                                """
                            ),
                        ),
                        (
                            "eval/benchmark.json",
                            json_bytes(
                                {
                                    "cer": 0.081,
                                    "wer": 0.119,
                                    "latency_ms_p50": 64,
                                    "latency_ms_p95": 92,
                                }
                            ),
                        ),
                        (
                            "scripts/export_notes.md",
                            text_bytes(
                                """
                                # Export Notes

                                Checkpoint is intentionally small and fake. It only exists so local
                                flows hit LFS, quota, and file-tree code paths.
                                """
                            ),
                        ),
                    ),
                ),
            ),
            branch="benchmark-v2",
            tag="v0.3.0",
            download_path="checkpoints/aurora-ocr-lite.safetensors",
            download_sessions=12,
        ),
        RepoSeed(
            actor="leo_park",
            repo_type="dataset",
            namespace="aurora-labs",
            name="receipt-layout-bench",
            private=False,
            commits=(
                CommitSeed(
                    summary="Create receipt layout benchmark",
                    description="Public dataset repo with JSONL splits for dataset preview coverage.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: cc-by-4.0
                                pretty_name: Receipt Layout Bench
                                task_categories:
                                  - token-classification
                                ---

                                # receipt-layout-bench

                                Annotation benchmark for merchant, total, tax, and timestamp spans.
                                """
                            ),
                        ),
                        (
                            "splits/train.jsonl",
                            jsonl_bytes(
                                (
                                    {
                                        "image": "train_0001.png",
                                        "merchant": "North Pier Cafe",
                                        "total": "18.40",
                                        "currency": "USD",
                                    },
                                    {
                                        "image": "train_0002.png",
                                        "merchant": "River Town Mart",
                                        "total": "42.15",
                                        "currency": "USD",
                                    },
                                )
                            ),
                        ),
                        (
                            "splits/test.jsonl",
                            jsonl_bytes(
                                (
                                    {
                                        "image": "test_0001.png",
                                        "merchant": "Airport Bento",
                                        "total": "9.80",
                                        "currency": "USD",
                                    },
                                    {
                                        "image": "test_0002.png",
                                        "merchant": "Harbor Books",
                                        "total": "27.10",
                                        "currency": "USD",
                                    },
                                )
                            ),
                        ),
                        (
                            "schema/fields.json",
                            json_bytes(
                                {
                                    "merchant": "string",
                                    "total": "string",
                                    "currency": "string",
                                    "timestamp": "string",
                                }
                            ),
                        ),
                    ),
                ),
                CommitSeed(
                    summary="Add annotation guide",
                    description="Second dataset commit for history, tree diffing, and docs rendering.",
                    files=(
                        (
                            "docs/annotation-guide.md",
                            text_bytes(
                                """
                                # Annotation Guide

                                - mark printed totals, not handwritten notes
                                - keep currency in a dedicated field
                                - preserve merchant spelling from source image
                                """
                            ),
                        ),
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: cc-by-4.0
                                pretty_name: Receipt Layout Bench
                                task_categories:
                                  - token-classification
                                ---

                                # receipt-layout-bench

                                Annotation benchmark for merchant, total, tax, and timestamp spans.

                                The local seed intentionally mixes neat and messy receipts to cover
                                pagination, filters, and table previews.
                                """
                            ),
                        ),
                    ),
                ),
            ),
            branch="supplier-a-refresh",
            tag="v1.0.0",
            download_path="splits/test.jsonl",
            download_sessions=5,
        ),
        RepoSeed(
            actor="mai_lin",
            repo_type="model",
            namespace="aurora-labs",
            name="customer-support-rag",
            private=True,
            commits=(
                CommitSeed(
                    summary="Seed private support model workspace",
                    description="Private org repo for auth-only browsing and settings checks.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                # customer-support-rag

                                Internal-only retrieval and prompt assets for support workflows.
                                This repo is private and visible to Aurora Labs members only.
                                """
                            ),
                        ),
                        (
                            "prompt/system.txt",
                            text_bytes(
                                """
                                You are a cautious support assistant. Answer only with facts from
                                the indexed knowledge base, and cite the exact article title.
                                """
                            ),
                        ),
                        (
                            "retrieval/index-schema.json",
                            json_bytes(
                                {
                                    "article_id": "string",
                                    "channel": "string",
                                    "lang": "string",
                                    "text": "string",
                                }
                            ),
                        ),
                        (
                            "config.json",
                            json_bytes(
                                {
                                    "chunk_size": 384,
                                    "embedding_model": "bge-small-en-v1.5",
                                    "top_k": 6,
                                }
                            ),
                        ),
                    ),
                ),
                CommitSeed(
                    summary="Add ops runbook",
                    description="Keep a second private-org commit for local history inspection.",
                    files=(
                        (
                            "docs/runbook.md",
                            text_bytes(
                                """
                                # Runbook

                                - refresh embeddings weekly
                                - snapshot prompts before frontend demos
                                - record regressions against the fixed local seed data
                                """
                            ),
                        ),
                    ),
                ),
            ),
            download_path="prompt/system.txt",
            download_sessions=1,
        ),
        RepoSeed(
            actor="noah_kim",
            repo_type="model",
            namespace="harbor-vision",
            name="marine-seg-small",
            private=False,
            commits=(
                CommitSeed(
                    summary="Publish marine segmentation starter model",
                    description="Public vision model with another fake LFS checkpoint.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: apache-2.0
                                pipeline_tag: image-segmentation
                                tags:
                                  - segmentation
                                  - marine
                                  - edge
                                ---

                                # marine-seg-small

                                Compact segmentation model for harbor waterlines, safety zones,
                                and dock equipment outlines.
                                """
                            ),
                        ),
                        (
                            "config.json",
                            json_bytes(
                                {
                                    "backbone": "convnext-tiny",
                                    "classes": ["water", "dock", "vessel", "buoy"],
                                    "input_size": 512,
                                }
                            ),
                        ),
                        (
                            "labels.json",
                            json_bytes(
                                {
                                    "0": "water",
                                    "1": "dock",
                                    "2": "vessel",
                                    "3": "buoy",
                                }
                            ),
                        ),
                        seed_file(
                            "checkpoints/marine-seg-small.safetensors",
                            lambda: make_single_checkpoint_bytes(
                                "marine-seg-small",
                                (
                                    ("backbone.stem.conv1.weight", (4096, 1536)),
                                    ("decode_head.classifier.weight", (1024, 1024)),
                                ),
                            ),
                        ),
                    ),
                ),
                CommitSeed(
                    summary="Add harbor evaluation report",
                    description="Second model commit for history and stats coverage.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: apache-2.0
                                pipeline_tag: image-segmentation
                                tags:
                                  - segmentation
                                  - marine
                                  - edge
                                ---

                                # marine-seg-small

                                Compact segmentation model for harbor waterlines, safety zones,
                                and dock equipment outlines.

                                ## Eval highlights

                                - best IoU on waterline masks from overcast camera feeds
                                - weaker on stacked cargo edges during dusk
                                """
                            ),
                        ),
                        (
                            "eval/coastal-harbor.json",
                            json_bytes(
                                {
                                    "iou_dock": 0.84,
                                    "iou_vessel": 0.79,
                                    "iou_water": 0.91,
                                }
                            ),
                        ),
                    ),
                ),
            ),
            branch="saltwater-eval",
            tag="v1.1.0",
            download_path="checkpoints/marine-seg-small.safetensors",
            download_sessions=6,
        ),
        RepoSeed(
            actor="noah_kim",
            repo_type="space",
            namespace="harbor-vision",
            name="smoke-test-dashboard",
            private=True,
            commits=(
                CommitSeed(
                    summary="Create private smoke-test dashboard",
                    description="Private org space used for auth and space rendering checks.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                # smoke-test-dashboard

                                Private dashboard for camera ingest smoke tests and deployment sign-off.
                                """
                            ),
                        ),
                        (
                            "app.py",
                            text_bytes(
                                """
                                import gradio as gr

                                dashboard = gr.Interface(
                                    fn=lambda status: f"dashboard status: {status}",
                                    inputs=gr.Textbox(label="Input"),
                                    outputs=gr.Textbox(label="Output"),
                                    title="Smoke Test Dashboard",
                                )

                                if __name__ == "__main__":
                                    dashboard.launch()
                                """
                            ),
                        ),
                        ("requirements.txt", text_bytes("gradio>=4.44.0")),
                    ),
                ),
                CommitSeed(
                    summary="Add dashboard notes",
                    description="Second private-space commit for browsing stateful history locally.",
                    files=(
                        (
                            "dashboards/README.md",
                            text_bytes(
                                """
                                # Dashboard Notes

                                Fixed local fixtures are better than random telemetry when the goal
                                is to reproduce layout and auth bugs.
                                """
                            ),
                        ),
                    ),
                ),
            ),
            download_path="README.md",
            download_sessions=1,
        ),
        RepoSeed(
            actor="leo_park",
            repo_type="space",
            namespace="leo_park",
            name="formula-checker-lite",
            private=False,
            commits=(
                CommitSeed(
                    summary="Create public formula checker demo",
                    description="Lightweight public space for user profile and space listings.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                # formula-checker-lite

                                Small browser demo that validates spreadsheet-style formulas and
                                flags obviously broken references.
                                """
                            ),
                        ),
                        (
                            "app.py",
                            text_bytes(
                                """
                                import gradio as gr

                                def validate(expr: str) -> str:
                                    return "looks valid" if "=" in expr else "missing leading ="

                                demo = gr.Interface(
                                    fn=validate,
                                    inputs=gr.Textbox(label="Formula"),
                                    outputs=gr.Textbox(label="Status"),
                                    title="Formula Checker Lite",
                                )

                                if __name__ == "__main__":
                                    demo.launch()
                                """
                            ),
                        ),
                        ("requirements.txt", text_bytes("gradio>=4.44.0")),
                    ),
                ),
                CommitSeed(
                    summary="Add preset expressions",
                    description="Second commit keeps this user-owned space non-trivial.",
                    files=(
                        (
                            "assets/presets.json",
                            json_bytes(
                                {
                                    "valid": "=SUM(A1:A3)",
                                    "invalid": "SUM(A1:A3)",
                                    "cross_sheet": "=Sheet2!B4",
                                }
                            ),
                        ),
                    ),
                ),
            ),
            download_path="README.md",
            download_sessions=2,
        ),
        RepoSeed(
            actor="sara_chen",
            repo_type="dataset",
            namespace="sara_chen",
            name="invoice-entities-mini",
            private=False,
            commits=(
                CommitSeed(
                    summary="Seed invoice entity dataset",
                    description="Public user dataset so profile pages are not empty.",
                    files=(
                        (
                            "README.md",
                            text_bytes(
                                """
                                ---
                                license: cc-by-4.0
                                pretty_name: Invoice Entities Mini
                                task_categories:
                                  - token-classification
                                ---

                                # invoice-entities-mini

                                Tiny invoice entity dataset for local schema, preview, and table rendering checks.
                                """
                            ),
                        ),
                        (
                            "data/train.jsonl",
                            jsonl_bytes(
                                (
                                    {
                                        "invoice_id": "inv_1001",
                                        "vendor": "Blue Harbor Logistics",
                                        "amount": "1240.00",
                                    },
                                    {
                                        "invoice_id": "inv_1002",
                                        "vendor": "Northline Design",
                                        "amount": "315.50",
                                    },
                                )
                            ),
                        ),
                        (
                            "data/test.jsonl",
                            jsonl_bytes(
                                (
                                    {
                                        "invoice_id": "inv_2001",
                                        "vendor": "River Street Foods",
                                        "amount": "89.20",
                                    },
                                )
                            ),
                        ),
                        (
                            "schema.json",
                            json_bytes(
                                {
                                    "invoice_id": "string",
                                    "vendor": "string",
                                    "amount": "string",
                                }
                            ),
                        ),
                    ),
                ),
                CommitSeed(
                    summary="Add notebook notes",
                    description="Second public dataset commit for file tree and commit history coverage.",
                    files=(
                        (
                            "notebooks/README.md",
                            text_bytes(
                                """
                                # Notebook Notes

                                Keep the local seed tiny. If a preview bug shows up here, it is much
                                easier to reason about than a random large import.
                                """
                            ),
                        ),
                    ),
                ),
            ),
            download_path="data/train.jsonl",
            download_sessions=3,
        ),
    )


def build_open_media_core_repo_seeds() -> tuple[RepoSeed, ...]:
    """Build the core ``open-media-lab`` repo seeds owned by ``mai_lin``.

    Returns three public repositories, in this order:

    1. ``multimodal-benchmark-suite`` (dataset): parquet shards, an indexed
       tar archive plus its index, and top-level audio/video/image assets.
    2. ``vision-language-assistant-3b`` (model): config/tokenizer files and a
       three-shard safetensors checkpoint with its index JSON.
    3. ``hierarchy-crawl-fixtures`` (dataset): files produced by
       ``make_deep_tree_files`` for tree navigation checks.

    Large payloads are handed to ``seed_file`` as zero-argument callables.
    The tar archive and the model shards are each built at most once per call
    of this function and shared between the callables that need them.
    """
    # Per-invocation memo slots for the two expensive multi-file payloads.
    archive_cache: dict[str, tuple[bytes, bytes]] = {}
    model_bundle_cache: dict[str, dict[str, bytes]] = {}

    # Only a subset of the images is exposed at the repo top level; the tar
    # archive carries the complete image set.
    top_level_image_assets = (
        SAFEBOORU_IMAGE_ASSETS[:4] + SAFEBOORU_IMAGE_ASSETS[-2:]
    )
    archive_image_assets = SAFEBOORU_IMAGE_ASSETS
    # (repo path, remote asset name) pairs for every top-level media file.
    top_level_media_entries = (
        ("media/audio/voices-speech.wav", "voices-speech.wav"),
        ("media/audio/steam-train-whistle.wav", "steam-train-whistle.wav"),
        ("media/video/opencv-vtest.avi", "opencv-vtest.avi"),
        *(
            (f"media/images/{asset.cache_name}", asset.cache_name)
            for asset in top_level_image_assets
        ),
    )

    # Single source of truth for the checkpoint shards: (filename, tensor
    # specs). Both the bundle builder and the seed-file list iterate this.
    shard_specs = (
        (
            "model-00001-of-00003.safetensors",
            (
                ("language_model.embed_tokens.weight", (7680, 4096)),
                ("language_model.layers.0.mlp.down_proj.weight", (4096, 2048)),
            ),
        ),
        (
            "model-00002-of-00003.safetensors",
            (("language_model.layers.14.self_attn.q_proj.weight", (8192, 4096)),),
        ),
        (
            "model-00003-of-00003.safetensors",
            (
                ("language_model.layers.27.mlp.up_proj.weight", (8192, 4096)),
                ("vision_tower.vision_model.embeddings.class_embedding", (1, 1408)),
            ),
        ),
    )

    def archive_bundle() -> tuple[bytes, bytes]:
        """Return the memoized result of ``make_indexed_tar_bundle``.

        Element 0 is seeded as the ``.tar`` file and element 1 as the
        ``.json`` index file.
        """
        cached = archive_cache.get("bundle")
        if cached is not None:
            return cached

        # Fetch every image payload exactly once; the same bytes feed both the
        # tar member and the size recorded in the embedded manifest.
        image_members = tuple(
            (
                asset,
                f"images/{asset.cache_name}",
                remote_asset_bytes(asset.cache_name),
            )
            for asset in archive_image_assets
        )

        archived_files = tuple(
            (member_path, payload) for _, member_path, payload in image_members
        ) + (
            (
                "annotations/captions.jsonl",
                jsonl_bytes(
                    tuple(
                        {
                            "asset": member_path,
                            "caption": f"SafeBooru fixture mirrored from {asset.source_url}.",
                            "source_url": asset.source_url,
                            # The first six images are train, the rest validation.
                            "split": "train" if index < 6 else "validation",
                        }
                        for index, (asset, member_path, _) in enumerate(image_members)
                    )
                ),
            ),
            (
                "metadata/source-assets.json",
                json_bytes(
                    {
                        "assets": [
                            {
                                "path": member_path,
                                "sha256": asset.sha256,
                                "size": len(payload),
                                "source_url": asset.source_url,
                            }
                            for asset, member_path, payload in image_members
                        ]
                    }
                ),
            ),
        )
        cached = make_indexed_tar_bundle("open-media-archive", archived_files)
        archive_cache["bundle"] = cached
        return cached

    def model_bundle() -> dict[str, bytes]:
        """Return the memoized mapping of checkpoint filename to file bytes.

        Contains one entry per shard in ``shard_specs`` plus
        ``model.safetensors.index.json``.
        """
        cached = model_bundle_cache.get("bundle")
        if cached is not None:
            return cached

        bundle: dict[str, bytes] = {}
        total_tensor_bytes = 0
        weight_map: dict[str, str] = {}
        for filename, tensor_specs in shard_specs:
            payload, tensor_bytes = make_safetensors_bytes(
                f"vision-language-assistant-3b:{filename}",
                tensor_specs,
            )
            bundle[filename] = payload
            total_tensor_bytes += tensor_bytes
            # Record which shard holds each tensor for the index file.
            for tensor_name, _ in tensor_specs:
                weight_map[tensor_name] = filename

        bundle["model.safetensors.index.json"] = json_bytes(
            {
                "metadata": {"total_size": total_tensor_bytes},
                "weight_map": weight_map,
            }
        )
        model_bundle_cache["bundle"] = bundle
        return bundle

    multimodal_files: tuple[SeedFile, ...] = (
        (
            "README.md",
            text_bytes(
                """
                ---
                license: cc-by-4.0
                pretty_name: Open Media Multimodal Suite
                task_categories:
                  - automatic-speech-recognition
                  - image-to-text
                  - video-classification
                tags:
                  - parquet
                  - indexed-tar
                  - multimodal
                ---

                # multimodal-benchmark-suite

                Local benchmark dataset with real parquet shards, a hfutils.index-compatible
                tar archive, a larger SafeBooru image set, torchaudio sample WAV files, and an
                OpenCV sample video for frontend and admin demos.
                """
            ),
        ),
        (
            "dataset_infos.json",
            json_bytes(
                {
                    "default": {
                        "config_name": "default",
                        "features": {
                            "caption": {"dtype": "string", "_type": "Value"},
                            "duration_seconds": {"dtype": "float32", "_type": "Value"},
                            "payload": {"dtype": "binary", "_type": "Value"},
                            "sample_id": {"dtype": "string", "_type": "Value"},
                        },
                        "splits": {
                            "train": {
                                "name": "train",
                                "num_examples": 12000,
                            }
                        },
                    }
                }
            ),
        ),
        (
            "metadata/feature-card.json",
            json_bytes(
                {
                    "archive_index": "archives/raw-bundle-0000.json",
                    "archive_tar": "archives/raw-bundle-0000.tar",
                    "media_assets": [path for path, _ in top_level_media_entries],
                    "parquet_train": "parquet/train-00000-of-00001.parquet",
                }
            ),
        ),
        # NOTE(review): unlike the seed_file entries below, this manifest is
        # built eagerly, so remote_asset_bytes runs for every top-level media
        # entry as soon as this function is called -- confirm that is intended.
        (
            "metadata/source-assets.json",
            json_bytes(
                {
                    "assets": [
                        {
                            "path": path,
                            "sha256": REMOTE_MEDIA_ASSETS[asset_name].sha256,
                            "size": len(remote_asset_bytes(asset_name)),
                            "source_url": REMOTE_MEDIA_ASSETS[asset_name].source_url,
                        }
                        for path, asset_name in top_level_media_entries
                    ]
                }
            ),
        ),
        seed_file(
            "parquet/train-00000-of-00001.parquet",
            lambda: make_parquet_bytes("open-media-train", row_count=12000, payload_size=2048),
        ),
        seed_file(
            "parquet/validation-00000-of-00001.parquet",
            lambda: make_parquet_bytes("open-media-validation", row_count=1500, payload_size=1024),
        ),
        # Real HF-sourced parquet so the pure-client preview (issue #27)
        # can be exercised against a file that actually came off the
        # Hugging Face hub, not just locally generated pyarrow output.
        seed_file(
            "fixtures/hf-no-robots-test.parquet",
            lambda: remote_asset_bytes("hf-no-robots-test.parquet"),
        ),
        # asset_name is bound as a default argument so each callable keeps its
        # own asset instead of the loop's final value.
        *(
            seed_file(path, lambda asset_name=asset_name: remote_asset_bytes(asset_name))
            for path, asset_name in top_level_media_entries
        ),
        seed_file("archives/raw-bundle-0000.tar", lambda: archive_bundle()[0]),
        seed_file("archives/raw-bundle-0000.json", lambda: archive_bundle()[1]),
    )

    model_files: tuple[SeedFile, ...] = (
        (
            "README.md",
            text_bytes(
                """
                ---
                license: apache-2.0
                library_name: transformers
                pipeline_tag: image-text-to-text
                tags:
                  - multimodal
                  - sharded-weights
                  - local-dev
                ---

                # vision-language-assistant-3b

                Local multimodal checkpoint with real sharded safetensors weights,
                tokenizer assets, and processor configs.
                """
            ),
        ),
        (
            "config.json",
            json_bytes(
                {
                    "architectures": ["LlavaForConditionalGeneration"],
                    "hidden_size": 3072,
                    "max_position_embeddings": 8192,
                    "model_type": "llava",
                    "num_hidden_layers": 28,
                    "torch_dtype": "bfloat16",
                    "vocab_size": 128256,
                }
            ),
        ),
        (
            "generation_config.json",
            json_bytes(
                {
                    "do_sample": False,
                    "max_new_tokens": 512,
                    "temperature": 0.2,
                    "top_p": 0.9,
                }
            ),
        ),
        (
            "preprocessor_config.json",
            json_bytes(
                {
                    "crop_size": 448,
                    "do_center_crop": True,
                    "do_normalize": True,
                    "image_mean": [0.48145466, 0.4578275, 0.40821073],
                    "image_std": [0.26862954, 0.26130258, 0.27577711],
                }
            ),
        ),
        (
            "processor_config.json",
            json_bytes(
                {
                    "chat_template": "chat_template.jinja",
                    "image_processor_type": "CLIPImageProcessor",
                    "processor_class": "AutoProcessor",
                    "tokenizer_class": "PreTrainedTokenizerFast",
                }
            ),
        ),
        (
            "special_tokens_map.json",
            json_bytes(
                {
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                    "image_token": "<image>",
                    "pad_token": "<pad>",
                }
            ),
        ),
        (
            "tokenizer_config.json",
            json_bytes(
                {
                    "add_bos_token": True,
                    "chat_template": "{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}",
                    "legacy": False,
                    "model_max_length": 8192,
                    "padding_side": "right",
                }
            ),
        ),
        (
            "tokenizer.json",
            json_bytes(
                {
                    "added_tokens": [{"content": "<image>", "id": 128000}],
                    "normalizer": {"type": "NFKC"},
                    "pre_tokenizer": {"type": "ByteLevel"},
                    "version": "1.0",
                }
            ),
        ),
        (
            "chat_template.jinja",
            text_bytes(
                "{{ bos_token }}{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}{{ eos_token }}"
            ),
        ),
        (
            "README.weights.md",
            text_bytes(
                """
                # Weight Layout

                The checkpoint is intentionally sharded into valid safetensors files so
                local LFS upload, download, and tree views can exercise a few hundred
                megabytes of realistic model payloads.
                """
            ),
        ),
        seed_file(
            "model.safetensors.index.json",
            lambda: model_bundle()["model.safetensors.index.json"],
        ),
        # One deferred entry per shard, derived from shard_specs so the seeded
        # filenames cannot drift from the ones the bundle actually contains.
        *(
            seed_file(filename, lambda filename=filename: model_bundle()[filename])
            for filename, _ in shard_specs
        ),
        # Real HF-sourced safetensors (tiny-random-bert, ~520 KB) so the
        # pure-client preview (issue #27) can be exercised against a file
        # that actually came off the Hugging Face hub, not just locally
        # generated safetensors.numpy.save output.
        seed_file(
            "fixtures/hf-tiny-random-bert.safetensors",
            lambda: remote_asset_bytes("hf-tiny-random-bert.safetensors"),
        ),
    )

    return (
        RepoSeed(
            actor="mai_lin",
            repo_type="dataset",
            namespace="open-media-lab",
            name="multimodal-benchmark-suite",
            private=False,
            commits=(
                CommitSeed(
                    summary="Seed multimodal benchmark suite",
                    description=(
                        "Add a real parquet shard, indexed tar archive, and common media "
                        "formats to exercise local dataset browsing and LFS flows."
                    ),
                    files=multimodal_files,
                ),
                CommitSeed(
                    summary="Add archive notes and split manifest",
                    description="Keep the multimodal dataset active with a second commit and metadata refresh.",
                    files=(
                        (
                            "notes/archive-layout.md",
                            text_bytes(
                                """
                                # Archive Layout

                                The indexed tar bundle mirrors the hfutils.index layout so local
                                demos can inspect offsets, file sizes, and per-member checksums.
                                """
                            ),
                        ),
                        (
                            "metadata/splits.json",
                            json_bytes(
                                {
                                    "train": "parquet/train-00000-of-00001.parquet",
                                    "validation": "parquet/validation-00000-of-00001.parquet",
                                }
                            ),
                        ),
                    ),
                ),
            ),
            branch="curation-pass",
            tag="v2026.04-media",
            download_path="parquet/train-00000-of-00001.parquet",
            download_sessions=6,
        ),
        RepoSeed(
            actor="mai_lin",
            repo_type="model",
            namespace="open-media-lab",
            name="vision-language-assistant-3b",
            private=False,
            commits=(
                CommitSeed(
                    summary="Publish sharded multimodal assistant checkpoint",
                    description=(
                        "Add common Hugging Face model files and a few hundred megabytes "
                        "of sharded safetensors weights."
                    ),
                    files=model_files,
                ),
                CommitSeed(
                    summary="Add eval cards and prompt notes",
                    description="Follow-up commit for model history, metadata, and release-note views.",
                    files=(
                        (
                            "eval/benchmark.json",
                            json_bytes(
                                {
                                    "chart_qa_em": 0.71,
                                    "docvqa_anls": 0.63,
                                    "latency_ms_p95": 186,
                                }
                            ),
                        ),
                        (
                            "prompts/system.md",
                            text_bytes(
                                """
                                # System Prompt Notes

                                - prefer grounded answers over speculative OCR recovery
                                - preserve visible numbers and units
                                - mention image regions when ambiguity remains
                                """
                            ),
                        ),
                    ),
                ),
            ),
            branch="eval-refresh",
            tag="v0.9.0-local",
            download_path="model-00001-of-00003.safetensors",
            download_sessions=4,
        ),
        RepoSeed(
            actor="mai_lin",
            repo_type="dataset",
            namespace="open-media-lab",
            name="hierarchy-crawl-fixtures",
            private=False,
            commits=(
                CommitSeed(
                    summary="Seed deeply nested tree fixtures",
                    description=(
                        "Generate a repo with many files and several levels of nested paths "
                        "for tree navigation and search coverage."
                    ),
                    files=make_deep_tree_files("hierarchy-crawl"),
                ),
                CommitSeed(
                    summary="Add tree smoke-test notes",
                    description="Keep one extra commit so history and diff views remain non-trivial.",
                    files=(
                        (
                            "notes/path-review.md",
                            text_bytes(
                                """
                                # Path Review

                                This repo exists to keep large tree browsing reproducible. When a
                                pagination or sorting bug appears, use these fixtures first.
                                """
                            ),
                        ),
                    ),
                ),
            ),
            branch="path-review",
            tag="tree-fixtures-2026-04",
            # presumably one of the paths emitted by make_deep_tree_files -- verify
            # against that helper if its layout changes.
            download_path=(
                "catalog/section-06/tier-08/branch-06/node-06-08-06/"
                "entry-06-08-06.json"
            ),
            download_sessions=2,
        ),
    )


def build_open_media_showcase_repo_seeds() -> tuple[RepoSeed, ...]:
    """Build the compact filler repositories for the ``open-media-lab`` namespace.

    Every entry in ``specs`` becomes one ``RepoSeed`` acted on by ``mai_lin``
    with a single commit. The committed files depend on the repo type:

    * ``model``: README, ``config.json`` and a safetensors checkpoint whose
      bytes come from a deferred callable.
    * ``dataset``: README, a two-row ``data/rows.jsonl`` and a feature map.
    * anything else (the specs only use ``space``): README, a Gradio
      ``app.py`` and ``requirements.txt``.

    Returns:
        One ``RepoSeed`` per spec, in spec order.
    """
    # Each spec is (repo_type, name, private, summary).
    specs = (
        ("model", "dock-caption-lite", False, "dock captioning smoke-test model"),
        ("dataset", "quay-ops-snippets", False, "operations dataset for list and preview checks"),
        ("space", "repo-browser-demo", False, "space used to pin org landing content"),
        ("model", "layout-distill-small", False, "small layout parser release used for org pages"),
        ("dataset", "table-scan-fixtures", False, "table extraction fixtures for repeated browsing"),
        ("space", "taxonomy-review-room", True, "private review board for annotation changes"),
        ("model", "invoice-embeddings-small", False, "embedding checkpoint metadata fixture"),
        ("dataset", "ui-search-fixtures", False, "search and pagination samples"),
        ("space", "annotation-hotfix-board", True, "private space for triage workflows"),
        ("model", "signal-router-mini", False, "tiny routing model used in showcase cards"),
    )

    repos: list[RepoSeed] = []
    for repo_type, name, private, summary in specs:
        # The README is shared by all three repo types.
        readme = text_bytes(
            f"""
            # {name}

            {summary.capitalize()}.
            This repository exists to give open-media-lab a realistic repo count in local dev.
            """
        )

        if repo_type == "model":
            files: tuple[SeedFile, ...] = (
                ("README.md", readme),
                (
                    "config.json",
                    json_bytes(
                        {
                            "hidden_size": 768,
                            "model_type": name,
                            "num_hidden_layers": 12,
                        }
                    ),
                ),
                seed_file(
                    f"weights/{name}.safetensors",
                    # name=name binds the current loop value; a plain closure
                    # would see whatever ``name`` holds when it is finally called.
                    lambda name=name: make_single_checkpoint_bytes(
                        name,
                        (
                            ("model.embed_tokens.weight", (2048, 1024)),
                            ("model.layers.0.mlp.up_proj.weight", (1024, 512)),
                        ),
                    ),
                ),
            )
            download_path = f"weights/{name}.safetensors"
        elif repo_type == "dataset":
            files = (
                ("README.md", readme),
                (
                    "data/rows.jsonl",
                    jsonl_bytes(
                        (
                            {"id": f"{name}-0001", "label": "alpha"},
                            {"id": f"{name}-0002", "label": "beta"},
                        )
                    ),
                ),
                (
                    "metadata/features.json",
                    json_bytes({"id": "string", "label": "string"}),
                ),
            )
            download_path = "data/rows.jsonl"
        else:
            # Remaining specs are spaces: a minimal Gradio app plus its requirement.
            files = (
                ("README.md", readme),
                (
                    "app.py",
                    text_bytes(
                        f"""
                        import gradio as gr

                        demo = gr.Interface(
                            fn=lambda text: "{name}: " + text.strip(),
                            inputs=gr.Textbox(label="Input"),
                            outputs=gr.Textbox(label="Output"),
                            title="{name}",
                        )

                        if __name__ == "__main__":
                            demo.launch()
                        """
                    ),
                ),
                ("requirements.txt", text_bytes("gradio>=4.44.0")),
            )
            download_path = "README.md"

        repos.append(
            RepoSeed(
                actor="mai_lin",
                repo_type=repo_type,
                namespace="open-media-lab",
                name=name,
                private=private,
                commits=(
                    CommitSeed(
                        summary=f"Seed {name}",
                        description="Create a compact org repo so the listing page has real density.",
                        files=files,
                    ),
                ),
                download_path=download_path,
                # Public repos get one download session, private repos none.
                download_sessions=1 if not private else 0,
            )
        )

    return tuple(repos)


# Every repository the seed creates: the concatenation of the three builder
# outputs, in this order.
REPO_SEEDS = (
    build_repo_seeds()
    + build_open_media_core_repo_seeds()
    + build_open_media_showcase_repo_seeds()
)

# Likes to apply after the repositories exist. Each row is
# (liker username, repo_type, namespace, repo name).
LIKES: tuple[tuple[str, str, str, str], ...] = (
    ("leo_park", "model", "mai_lin", "lineart-caption-base"),
    ("leo_park", "dataset", "mai_lin", "street-sign-zh-en"),
    ("leo_park", "model", "harbor-vision", "marine-seg-small"),
    ("sara_chen", "model", "mai_lin", "lineart-caption-base"),
    ("sara_chen", "model", "aurora-labs", "aurora-ocr-lite"),
    ("sara_chen", "dataset", "aurora-labs", "receipt-layout-bench"),
    ("noah_kim", "model", "aurora-labs", "aurora-ocr-lite"),
    ("noah_kim", "dataset", "mai_lin", "street-sign-zh-en"),
    ("noah_kim", "space", "leo_park", "formula-checker-lite"),
    ("ivy_ops", "model", "mai_lin", "lineart-caption-base"),
    ("ivy_ops", "model", "aurora-labs", "aurora-ocr-lite"),
    ("ivy_ops", "dataset", "sara_chen", "invoice-entities-mini"),
    ("mai_lin", "model", "harbor-vision", "marine-seg-small"),
    ("mai_lin", "space", "leo_park", "formula-checker-lite"),
    ("mai_lin", "dataset", "aurora-labs", "receipt-layout-bench"),
)

# Global fallback sources installed via the admin API so a fresh local seed can
# resolve public HuggingFace repos out-of-the-box. Namespace "" = global scope.
# Each dict is sent as the JSON body when the source has to be created.
FALLBACK_SOURCE_SEEDS: tuple[dict, ...] = (
    {
        "namespace": "",
        "url": "https://huggingface.co",
        "token": None,
        "priority": 1000,
        "name": "HuggingFace",
        "source_type": "huggingface",
        "enabled": True,
    },
)


def account_index() -> dict[str, AccountSeed]:
    """Map each seeded account's username to its ``AccountSeed``.

    If two accounts share a username, the later one in ``ACCOUNTS`` wins.
    """
    by_username: dict[str, AccountSeed] = {}
    for seed in ACCOUNTS:
        by_username[seed.username] = seed
    return by_username


def repo_slug(repo: RepoSeed) -> str:
    """Return a dash-joined ``type-namespace-name`` identifier with no slashes."""
    raw = f"{repo.repo_type}-{repo.namespace}-{repo.name}"
    # Splitting on "/" and re-joining with "-" swaps every slash for a dash.
    return "-".join(raw.split("/"))


def make_avatar_bytes(label: str, background: str, accent: str) -> bytes:
    """Render a 512x512 PNG avatar and return its encoded bytes.

    The image is a ``background``-filled square with an ``accent`` rounded
    outline and an ``accent`` disc. Up to two initials taken from the
    dash-separated words of ``label`` are drawn centred in ``background``.
    """
    canvas_size = 512
    canvas = Image.new("RGB", (canvas_size, canvas_size), background)
    pen = ImageDraw.Draw(canvas)

    pen.rounded_rectangle((48, 48, 464, 464), radius=96, outline=accent, width=16)
    pen.ellipse((120, 120, 392, 392), fill=accent)

    # Only dashes count as word separators; at most two words contribute.
    words = label.replace("-", " ").split()[:2]
    initials = "".join(word[0].upper() for word in words)

    font = ImageFont.load_default()
    left, top, right, bottom = pen.textbbox((0, 0), initials, font=font)
    origin = (
        (canvas_size - (right - left)) / 2,
        (canvas_size - (bottom - top)) / 2,
    )
    pen.text(origin, initials, fill=background, font=font)

    encoded = io.BytesIO()
    canvas.save(encoded, format="PNG")
    return encoded.getvalue()


def describe_error(response: httpx.Response) -> str:
    """Format a response as ``HTTP <status>: <body>`` for error messages.

    The body is the decoded JSON payload when the response parses as JSON,
    otherwise the raw response text.
    """
    status = response.status_code
    try:
        detail = response.json()
    except Exception:
        # Any decoding failure falls back to the unparsed body.
        detail = response.text
    return f"HTTP {status}: {detail}"


async def ensure_response(
    response: httpx.Response,
    action: str,
    allowed_statuses: tuple[int, ...] = (200,),
) -> httpx.Response:
    """Return ``response`` unchanged if its status code is acceptable.

    Args:
        response: The response to check.
        action: Description of the attempted operation, used in the error text.
        allowed_statuses: Status codes treated as success.

    Raises:
        SeedError: If the status code is not in ``allowed_statuses``.
    """
    if response.status_code in allowed_statuses:
        return response
    raise SeedError(f"{action} failed with {describe_error(response)}")


def url_to_internal_path(url: str) -> str:
    """Reduce a URL to its path and query string.

    Scheme, host and fragment are dropped. An empty path becomes ``"/"``.
    """
    parts = urlsplit(url)
    base = parts.path if parts.path else "/"
    return f"{base}?{parts.query}" if parts.query else base


def manifest_matches_current_seed() -> bool:
    """Report whether the on-disk manifest was written by the current seed version.

    Returns:
        ``True`` only when ``MANIFEST_PATH`` can be read, parses as a JSON
        object, and its ``seed_version`` equals ``SEED_VERSION``. A missing,
        unreadable, malformed or non-object manifest yields ``False``.
    """
    try:
        # Read directly rather than checking exists() first, so a file that
        # disappears in between is handled by the same path as a missing one.
        payload = json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: invalid UTF-8 or invalid JSON (JSONDecodeError subclasses it).
        return False

    # Valid JSON is not necessarily an object; a list or scalar has no .get().
    if not isinstance(payload, dict):
        return False

    return payload.get("seed_version") == SEED_VERSION


def representative_seed_repositories() -> tuple[RepoSeed, ...]:
    """Pick the first public repository of each repo type from ``REPO_SEEDS``.

    Results are ordered by where each type's first public repo appears in
    ``REPO_SEEDS``.
    """
    first_public_by_type: dict[str, RepoSeed] = {}
    for candidate in REPO_SEEDS:
        if not candidate.private:
            # setdefault keeps the earliest repo seen for a type.
            first_public_by_type.setdefault(candidate.repo_type, candidate)
    return tuple(first_public_by_type.values())


async def detect_seed_state(client: httpx.AsyncClient) -> str:
    """Classify the target instance as ``"missing"``, ``"incomplete"`` or ``"ready"``.

    * ``"missing"``: the primary user lookup returns 404.
    * ``"incomplete"``: the user exists but the manifest does not match the
      current seed, or a representative repo's info or ``main`` tree is 404.
    * ``"ready"``: every check passed.

    Raises:
        SeedError: If any probe returns a status other than 200 or 404.
    """
    user_probe = await client.get(
        f"/api/users/{PRIMARY_USERNAME}/type",
        params={"fallback": "false"},
    )
    if user_probe.status_code == 404:
        return "missing"
    await ensure_response(user_probe, f"check existing seed for {PRIMARY_USERNAME}")

    if not manifest_matches_current_seed():
        return "incomplete"

    for repo in representative_seed_repositories():
        # Metadata is probed first, then the tree listing of main.
        probes = (
            (
                f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}",
                f"verify seeded repo metadata for {repo.namespace}/{repo.name}",
            ),
            (
                f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/tree/main",
                f"verify seeded repo storage for {repo.namespace}/{repo.name}",
            ),
        )
        for url, action in probes:
            probe = await client.get(url)
            if probe.status_code == 404:
                return "incomplete"
            await ensure_response(probe, action)

    return "ready"


async def register_account(client: httpx.AsyncClient, account: AccountSeed) -> None:
    """Register ``account`` with the default password, tolerating duplicates.

    A 400 whose JSON body mentions ``exists`` or ``conflicts`` is treated as
    "already registered".

    Raises:
        SeedError: For any other non-200 outcome.
    """
    credentials = {
        "username": account.username,
        "email": account.email,
        "password": DEFAULT_PASSWORD,
    }
    reply = await client.post("/api/auth/register", json=credentials)

    status = reply.status_code
    if status == 200:
        return

    if status == 400:
        detail = str(reply.json())
        if any(marker in detail for marker in ("exists", "conflicts")):
            return

    raise SeedError(f"register {account.username} failed with {describe_error(reply)}")


async def login_account(client: httpx.AsyncClient, account: AccountSeed) -> None:
    """Log ``account`` in on ``client`` and require a ``session_id`` cookie.

    Raises:
        SeedError: If the login request fails or no session cookie was stored.
    """
    credentials = {"username": account.username, "password": DEFAULT_PASSWORD}
    reply = await client.post("/api/auth/login", json=credentials)
    await ensure_response(reply, f"login {account.username}")

    if "session_id" in client.cookies:
        return
    raise SeedError(f"login {account.username} did not set a session cookie")


async def upload_avatar(
    client: httpx.AsyncClient,
    path: str,
    label: str,
    background: str,
    accent: str,
) -> None:
    """Generate an avatar for ``label`` and POST it to ``path`` as a file upload.

    Raises:
        SeedError: If the upload does not return 200.
    """
    png_bytes = make_avatar_bytes(label, background, accent)
    # Sent as the "file" multipart field: (filename, content, content type).
    upload = {"file": (f"{label}.png", png_bytes, "image/png")}
    reply = await client.post(path, files=upload)
    await ensure_response(reply, f"upload avatar for {label}")


async def configure_user_profile(client: httpx.AsyncClient, account: AccountSeed) -> None:
    """Write the account's profile settings, then upload its avatar.

    Raises:
        SeedError: If either request does not return 200.
    """
    profile = {
        "email": account.email,
        "full_name": account.full_name,
        "bio": account.bio,
        "website": account.website,
        "social_media": account.social_media,
    }
    reply = await client.put(f"/api/users/{account.username}/settings", json=profile)
    await ensure_response(reply, f"update user settings for {account.username}")

    await upload_avatar(
        client,
        path=f"/api/users/{account.username}/avatar",
        label=account.username,
        background=account.avatar_bg,
        accent=account.avatar_accent,
    )


def admin_headers() -> dict[str, str]:
    """Return the header dict carrying the configured admin token."""
    headers: dict[str, str] = {}
    headers["X-Admin-Token"] = cfg.admin.secret_token
    return headers


async def ensure_fallback_source(
    client: httpx.AsyncClient, source: dict
) -> None:
    """Create the fallback source via the admin API unless it is already listed.

    Existing sources for the same namespace are compared by URL, ignoring
    trailing slashes.

    Raises:
        SeedError: If the list or create request does not return 200.
    """
    endpoint = "/admin/api/fallback-sources"

    listing = await client.get(
        endpoint,
        params={"namespace": source["namespace"]},
        headers=admin_headers(),
    )
    await ensure_response(
        listing,
        f"list fallback sources for namespace={source['namespace']!r}",
    )

    normalized_url = source["url"].rstrip("/")
    already_registered = any(
        entry["url"].rstrip("/") == normalized_url for entry in listing.json()
    )
    if already_registered:
        return

    created = await client.post(endpoint, json=source, headers=admin_headers())
    await ensure_response(
        created,
        f"create fallback source {source['name']} ({normalized_url})",
    )


async def create_organization(
    client: httpx.AsyncClient, organization: OrganizationSeed
) -> None:
    """Create the organization, treating an "already exists" 400 as success.

    Raises:
        SeedError: For any other non-200 outcome.
    """
    body = {"name": organization.name, "description": organization.description}
    reply = await client.post("/org/create", json=body)

    # The JSON body is only decoded for 400 responses.
    duplicate = reply.status_code == 400 and "already exists" in str(reply.json())
    if reply.status_code == 200 or duplicate:
        return

    raise SeedError(
        f"create organization {organization.name} failed with {describe_error(reply)}"
    )


async def ensure_org_member(
    client: httpx.AsyncClient,
    org_name: str,
    username: str,
    role: str,
) -> None:
    """Add ``username`` to ``org_name`` and then force the requested role.

    The add request may answer 200 or 400; the role is set with a follow-up
    PUT in both cases.

    Raises:
        SeedError: If the add request returns another status, or the role
            update does not return 200.
    """
    tolerated_statuses = (200, 400)
    added = await client.post(
        f"/org/{org_name}/members",
        json={"username": username, "role": role},
    )
    if added.status_code not in tolerated_statuses:
        raise SeedError(
            f"add {username} to {org_name} failed with {describe_error(added)}"
        )

    # PUT keeps roles deterministic even if the member already existed.
    updated = await client.put(
        f"/org/{org_name}/members/{username}",
        json={"role": role},
    )
    await ensure_response(updated, f"set role for {username} in {org_name}")


async def configure_organization(
    client: httpx.AsyncClient, organization: OrganizationSeed
) -> None:
    """Write the organization's settings, then upload its avatar.

    Raises:
        SeedError: If either request does not return 200.
    """
    settings = {
        "description": organization.description,
        "bio": organization.bio,
        "website": organization.website,
        "social_media": organization.social_media,
    }
    reply = await client.put(
        f"/api/organizations/{organization.name}/settings",
        json=settings,
    )
    await ensure_response(reply, f"update organization settings for {organization.name}")

    await upload_avatar(
        client,
        path=f"/api/organizations/{organization.name}/avatar",
        label=organization.name,
        background=organization.avatar_bg,
        accent=organization.avatar_accent,
    )


async def create_repo(client: httpx.AsyncClient, repo: RepoSeed) -> None:
    """Create ``repo``, treating an "already exists" 400 as success.

    The ``organization`` field is sent only when the repo's namespace differs
    from the acting user.

    Raises:
        SeedError: For any other non-200 outcome.
    """
    body = {
        "type": repo.repo_type,
        "name": repo.name,
        "private": repo.private,
    }
    if repo.actor != repo.namespace:
        body["organization"] = repo.namespace

    reply = await client.post("/api/repos/create", json=body)

    # The JSON body is only decoded for 400 responses.
    duplicate = reply.status_code == 400 and "already exists" in str(reply.json())
    if reply.status_code == 200 or duplicate:
        return

    raise SeedError(f"create repo {repo.namespace}/{repo.name} failed with {describe_error(reply)}")


async def upload_lfs_object(
    client: httpx.AsyncClient,
    repo: RepoSeed,
    content: bytes,
) -> tuple[str, int]:
    """Push ``content`` through the repo's LFS batch endpoint.

    The batch request goes through ``client``. If the response carries an
    upload action, the bytes are PUT to its ``href`` with a separate network
    client, and any verify action is then POSTed back through ``client``. With
    no upload action nothing is transferred (presumably the server already has
    the object — confirm against the LFS endpoint).

    Returns:
        ``(oid, size)``: the SHA-256 hex digest and byte length of ``content``.

    Raises:
        SeedError: If the batch call fails, reports a per-object error, the
            upload PUT is not 200/201, or verification fails.
    """
    oid = hashlib.sha256(content).hexdigest()
    size = len(content)

    batch_request = {
        "operation": "upload",
        "transfers": ["basic"],
        "objects": [{"oid": oid, "size": size}],
        "hash_algo": "sha256",
        # Local dev uses the frontend base_url publicly, so the seed script rewrites
        # verify URLs back onto the in-process backend transport.
        "is_browser": True,
    }
    batch_reply = await client.post(
        f"/{repo.repo_type}s/{repo.namespace}/{repo.name}.git/info/lfs/objects/batch",
        json=batch_request,
    )
    await ensure_response(batch_reply, f"prepare LFS upload for {repo.namespace}/{repo.name}")

    obj = batch_reply.json()["objects"][0]
    if obj.get("error"):
        raise SeedError(f"LFS batch returned an error for {repo.namespace}/{repo.name}: {obj['error']}")

    actions = obj.get("actions") or {}
    upload_action = actions.get("upload")
    if not upload_action:
        return oid, size

    # The upload href is an absolute URL, so it needs a real network client.
    async with httpx.AsyncClient(follow_redirects=False, timeout=60.0) as network_client:
        upload_response = await network_client.put(
            upload_action["href"],
            content=content,
            headers=upload_action.get("header") or {},
        )

    if upload_response.status_code not in (200, 201):
        raise SeedError(
            f"LFS upload failed for {repo.namespace}/{repo.name}: "
            f"HTTP {upload_response.status_code} {upload_response.text}"
        )

    verify_action = actions.get("verify")
    if verify_action:
        verify_response = await client.post(
            url_to_internal_path(verify_action["href"]),
            json={"oid": oid, "size": size},
        )
        await ensure_response(
            verify_response,
            f"verify LFS upload for {repo.namespace}/{repo.name}",
        )

    return oid, size


async def commit_files(
    client: httpx.AsyncClient,
    repo: RepoSeed,
    commit: CommitSeed,
) -> None:
    """Push one seeded commit to the repo's ``main`` branch.

    Files are first described to the preupload endpoint, which returns an
    ``uploadMode`` and ``shouldIgnore`` flag per path. Ignored files are
    skipped, ``lfs`` files are uploaded through ``upload_lfs_object`` and
    referenced by oid, and all other files are inlined as base64. The commit
    is sent as newline-delimited JSON with a leading header line.

    Raises:
        SeedError: If the preupload, any LFS upload, or the commit fails.
    """
    staged = [materialize_seed_file(entry) for entry in commit.files]

    preupload_response = await client.post(
        f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/preupload/main",
        json={
            "files": [
                {
                    "path": path,
                    "size": len(blob),
                    "sha256": hashlib.sha256(blob).hexdigest(),
                }
                for path, blob in staged
            ]
        },
    )
    await ensure_response(
        preupload_response,
        f"preupload {repo.namespace}/{repo.name}",
    )
    upload_plan = {
        entry["path"]: entry for entry in preupload_response.json().get("files", [])
    }

    header = {"summary": commit.summary, "description": commit.description}
    operations = [{"key": "header", "value": header}]

    for path, blob in staged:
        plan = upload_plan[path]
        mode = plan["uploadMode"]

        if plan["shouldIgnore"]:
            continue

        if mode == "lfs":
            oid, size = await upload_lfs_object(client, repo, blob)
            operation = {
                "key": "lfsFile",
                "value": {"path": path, "oid": oid, "size": size, "algo": "sha256"},
            }
        else:
            operation = {
                "key": "file",
                "value": {
                    "path": path,
                    "content": base64.b64encode(blob).decode("ascii"),
                    "encoding": "base64",
                },
            }
        operations.append(operation)

    # One JSON document per line, no trailing newline.
    ndjson_payload = "\n".join(json.dumps(entry, sort_keys=True) for entry in operations)
    commit_response = await client.post(
        f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/commit/main",
        content=ndjson_payload,
        headers={"Content-Type": "application/x-ndjson"},
    )
    await ensure_response(commit_response, f"commit {repo.namespace}/{repo.name}")


async def create_branch(client: httpx.AsyncClient, repo: RepoSeed) -> None:
    """Create the repo's seed branch from ``main``; do nothing if none is set.

    A 400 or 409 whose JSON body mentions "already exists" counts as success.

    Raises:
        SeedError: For any other non-200 outcome.
    """
    branch = repo.branch
    if not branch:
        return

    reply = await client.post(
        f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/branch",
        json={"branch": branch, "revision": "main"},
    )
    if reply.status_code == 200:
        return

    # The JSON body is only decoded for 400/409 responses.
    duplicate = reply.status_code in (400, 409) and "already exists" in str(reply.json())
    if not duplicate:
        raise SeedError(
            f"create branch {repo.branch} for {repo.namespace}/{repo.name} failed with "
            f"{describe_error(reply)}"
        )


async def create_tag(client: httpx.AsyncClient, repo: RepoSeed) -> None:
    """Create the repo's seed tag at ``main``; do nothing if none is set.

    A 400 or 409 whose JSON body mentions "already exists" counts as success.

    Raises:
        SeedError: For any other non-200 outcome.
    """
    tag = repo.tag
    if not tag:
        return

    reply = await client.post(
        f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/tag",
        json={"tag": tag, "revision": "main"},
    )
    if reply.status_code == 200:
        return

    # The JSON body is only decoded for 400/409 responses.
    duplicate = reply.status_code in (400, 409) and "already exists" in str(reply.json())
    if not duplicate:
        raise SeedError(
            f"create tag {repo.tag} for {repo.namespace}/{repo.name} failed with "
            f"{describe_error(reply)}"
        )


async def like_repo(
    client: httpx.AsyncClient,
    repo_type: str,
    namespace: str,
    name: str,
) -> None:
    """Like a repository as the user behind ``client``.

    A 400 whose JSON body mentions "already liked" counts as success.

    Raises:
        SeedError: For any other non-200 outcome.
    """
    reply = await client.post(f"/api/{repo_type}s/{namespace}/{name}/like")

    # The JSON body is only decoded for 400 responses.
    repeated = reply.status_code == 400 and "already liked" in str(reply.json())
    if reply.status_code == 200 or repeated:
        return

    raise SeedError(
        f"like {repo_type}/{namespace}/{name} failed with {describe_error(reply)}"
    )


async def trigger_download(
    client: httpx.AsyncClient,
    repo: RepoSeed,
    path: str,
    *,
    cookies: dict[str, str] | None = None,
) -> None:
    """Request ``path`` from the repo's ``main`` resolve endpoint.

    Only a 302 or 307 redirect counts as success; the redirect target is not
    fetched here.

    Raises:
        SeedError: If the response is anything other than 302 or 307.
    """
    resolve_url = f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/resolve/main/{path}"
    reply = await client.get(resolve_url, cookies=cookies)
    if reply.status_code in (302, 307):
        return

    raise SeedError(
        f"download seed for {repo.namespace}/{repo.name}:{path} failed with "
        f"{describe_error(reply)}"
    )


def build_manifest() -> dict:
    """Assemble the manifest describing everything this seed creates.

    It records the seed version, manifest path, UI and backend URLs, the
    primary login, all other accounts, the admin UI URL and token, plus every
    seeded organization, repository and fallback source.
    """
    secondary_users = [
        {
            "username": account.username,
            "password": DEFAULT_PASSWORD,
            "email": account.email,
        }
        for account in ACCOUNTS
        if account.username != PRIMARY_USERNAME
    ]

    organizations = []
    for organization in ORGANIZATIONS:
        roster = [
            {"username": username, "role": role}
            for username, role in organization.members
        ]
        organizations.append({"name": organization.name, "members": roster})

    repositories = [
        {
            "type": repo.repo_type,
            "namespace": repo.namespace,
            "name": repo.name,
            "private": repo.private,
        }
        for repo in REPO_SEEDS
    ]

    # URLs are stored without a trailing slash; other seed keys are left out.
    fallback_sources = [
        {
            "namespace": source["namespace"],
            "url": source["url"].rstrip("/"),
            "name": source["name"],
            "source_type": source["source_type"],
            "priority": source["priority"],
        }
        for source in FALLBACK_SOURCE_SEEDS
    ]

    return {
        "seed_version": SEED_VERSION,
        "manifest_path": str(MANIFEST_PATH),
        "main_ui_url": cfg.app.base_url,
        "backend_url": INTERNAL_BASE_URL,
        "main_login": {
            "username": PRIMARY_USERNAME,
            "password": DEFAULT_PASSWORD,
        },
        "additional_users": secondary_users,
        "admin_ui": {
            "url": "http://127.0.0.1:5174",
            "token": cfg.admin.secret_token,
        },
        "organizations": organizations,
        "repositories": repositories,
        "fallback_sources": fallback_sources,
    }


def write_manifest() -> None:
    """Serialize the manifest to ``MANIFEST_PATH``, creating parent directories.

    The file is UTF-8 JSON with two-space indentation, sorted keys and a
    trailing newline.
    """
    manifest_dir = MANIFEST_PATH.parent
    manifest_dir.mkdir(parents=True, exist_ok=True)

    serialized = json.dumps(build_manifest(), indent=2, sort_keys=True)
    MANIFEST_PATH.write_text(serialized + "\n", encoding="utf-8")


def print_summary(seed_applied: bool) -> None:
    """Print the seed outcome with the URLs and credentials for local use.

    Args:
        seed_applied: ``True`` if this run created the data, ``False`` if it
            was already present.
    """
    if seed_applied:
        headline = "Seeded"
    else:
        headline = "Seed already present"

    report = (
        f"{headline}: {SEED_VERSION}",
        f"Manifest: {MANIFEST_PATH}",
        f"Main UI: {cfg.app.base_url}",
        f"Backend: {INTERNAL_BASE_URL}",
        f"Login: {PRIMARY_USERNAME} / {DEFAULT_PASSWORD}",
        f"Admin UI token: {cfg.admin.secret_token}",
    )
    for line in report:
        print(line)


async def seed_demo_data() -> None:
    """Create the full demo data set through the in-process ASGI app.

    The steps run in order: detect existing seed state, register accounts,
    ensure fallback sources, log every account in and configure its profile,
    create and configure organizations, create repos with their commits,
    branch and tag, apply likes, trigger downloads, then write the manifest
    and print a summary.

    Raises:
        SeedError: If an existing seed is only partially present, or any
            step's request fails.
    """
    init_storage()
    # All clients below share this transport, which calls the app directly.
    transport = httpx.ASGITransport(app=app)
    accounts_by_name = account_index()

    # The exit stack closes every client opened below when the block ends.
    async with AsyncExitStack() as stack:
        # Client with no login; used for state detection, registration and
        # the admin-token fallback-source calls.
        seed_client = await stack.enter_async_context(
            httpx.AsyncClient(
                transport=transport,
                base_url=INTERNAL_BASE_URL,
                follow_redirects=False,
            )
        )

        seed_state = await detect_seed_state(seed_client)
        if seed_state == "ready":
            # Nothing to create: rewrite the manifest and report.
            write_manifest()
            print_summary(seed_applied=False)
            return
        if seed_state == "incomplete":
            raise SeedError(
                "Local demo seed is only partially present. "
                "Run `make reset-local-data` and then retry `make seed-demo`."
            )

        for account in ACCOUNTS:
            await register_account(seed_client, account)

        for fallback_source in FALLBACK_SOURCE_SEEDS:
            await ensure_fallback_source(seed_client, fallback_source)

        # One client per account so each keeps its own session cookies.
        authed_clients: dict[str, httpx.AsyncClient] = {}
        for account in ACCOUNTS:
            client = await stack.enter_async_context(
                httpx.AsyncClient(
                    transport=transport,
                    base_url=INTERNAL_BASE_URL,
                    follow_redirects=False,
                )
            )
            await login_account(client, account)
            await configure_user_profile(client, account)
            authed_clients[account.username] = client

        # The primary user creates every organization and manages its members.
        primary_client = authed_clients[PRIMARY_USERNAME]
        for organization in ORGANIZATIONS:
            await create_organization(primary_client, organization)
            for username, role in organization.members:
                # The primary user is skipped here; it is the one creating the org.
                if username == PRIMARY_USERNAME:
                    continue
                await ensure_org_member(primary_client, organization.name, username, role)
            await configure_organization(primary_client, organization)

        # Each repo is created and populated by its declared actor.
        for repo in REPO_SEEDS:
            repo_client = authed_clients[repo.actor]
            await create_repo(repo_client, repo)
            for commit in repo.commits:
                await commit_files(repo_client, repo, commit)
            await create_branch(repo_client, repo)
            await create_tag(repo_client, repo)

        for liker, repo_type, namespace, name in LIKES:
            await like_repo(authed_clients[liker], repo_type, namespace, name)

        # Client with no login, used for downloads from public repos.
        anon_client = await stack.enter_async_context(
            httpx.AsyncClient(
                transport=transport,
                base_url=INTERNAL_BASE_URL,
                follow_redirects=False,
            )
        )

        for repo in REPO_SEEDS:
            if not repo.download_path:
                continue

            if repo.private:
                # Private repos get exactly one download by the primary user,
                # regardless of download_sessions.
                await trigger_download(
                    authed_clients[PRIMARY_USERNAME],
                    repo,
                    repo.download_path,
                )
                continue

            # Public repos: one anonymous request per session, each with its
            # own hf_download_session cookie value.
            for session_number in range(repo.download_sessions):
                await trigger_download(
                    anon_client,
                    repo,
                    repo.download_path,
                    cookies={
                        "hf_download_session": f"seed-{repo_slug(repo)}-{session_number:02d}"
                    },
                )

        # Download tracking happens in background tasks off the API response path.
        await asyncio.sleep(0.5)

    write_manifest()
    print_summary(seed_applied=True)


def main() -> int:
    """Run the seed and return a process exit code.

    Returns:
        ``0`` on success, ``1`` if a ``SeedError`` was raised (its message is
        printed to stderr). Other exceptions propagate.
    """
    exit_code = 0
    try:
        asyncio.run(seed_demo_data())
    except SeedError as failure:
        print(f"Seed failed: {failure}", file=sys.stderr)
        exit_code = 1
    return exit_code


# When run as a script, exit with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())