#!/usr/bin/env python3 """Create deterministic local demo data through KohakuHub's API surface.""" from __future__ import annotations import asyncio import base64 import hashlib import io import json import math import sys import tarfile import tempfile import textwrap from collections.abc import Callable, Iterable from contextlib import AsyncExitStack from dataclasses import dataclass from pathlib import Path from urllib.parse import urlsplit import httpx import numpy as np from PIL import Image, ImageDraw, ImageFont import pyarrow as pa import pyarrow.parquet as pq import requests from hfutils import index as hf_index from safetensors.numpy import save as save_safetensors from seed_shared import SEED_VERSION ROOT_DIR = Path(__file__).resolve().parents[2] SRC_DIR = ROOT_DIR / "src" if str(SRC_DIR) not in sys.path: sys.path.insert(0, str(SRC_DIR)) from kohakuhub.config import cfg from kohakuhub.main import app from kohakuhub.utils.s3 import init_storage DEFAULT_PASSWORD = "KohakuDev123!" PRIMARY_USERNAME = "mai_lin" MANIFEST_PATH = ROOT_DIR / "hub-meta" / "dev" / "demo-seed-manifest.json" INTERNAL_BASE_URL = ( getattr(cfg.app, "internal_base_url", None) or cfg.app.base_url or "http://127.0.0.1:48888" ) class SeedError(RuntimeError): """Raised when demo data creation fails.""" @dataclass(frozen=True) class AccountSeed: username: str email: str full_name: str bio: str website: str social_media: dict[str, str] avatar_bg: str avatar_accent: str @dataclass(frozen=True) class OrganizationSeed: name: str description: str bio: str website: str social_media: dict[str, str] avatar_bg: str avatar_accent: str members: tuple[tuple[str, str], ...] @dataclass(frozen=True) class CommitSeed: summary: str description: str files: tuple["SeedFile", ...] @dataclass(frozen=True) class FileSeed: path: str content: bytes | Callable[[], bytes] @dataclass(frozen=True) class RepoSeed: actor: str repo_type: str namespace: str name: str private: bool commits: tuple[CommitSeed, ...] branch: str | None = None tag: str | None = None download_path: str | None = None download_sessions: int = 0 SeedFile = tuple[str, bytes] | FileSeed @dataclass(frozen=True) class RemoteAsset: cache_name: str url: str sha256: str source_url: str SEED_ASSET_CACHE_DIR = ROOT_DIR / "hub-meta" / "cache" / "seed-assets" ACCOUNTS: tuple[AccountSeed, ...] = ( AccountSeed( username="mai_lin", email="mai.lin@kohakuhub.dev", full_name="Mai Lin", bio=( "Product-minded ML engineer focused on reproducible dataset QA, " "small-model packaging, and local debugging workflows." ), website="https://kohakuhub.local/mai-lin", social_media={ "github": "mai-lin-labs", "huggingface": "mai-lin-labs", "twitter_x": "mai_lin_ops", }, avatar_bg="#183153", avatar_accent="#f59e0b", ), AccountSeed( username="leo_park", email="leo.park@kohakuhub.dev", full_name="Leo Park", bio=( "Frontend-heavy engineer who keeps repo demos honest with browser " "smoke tests and hand-curated example data." ), website="https://kohakuhub.local/leo-park", social_media={ "github": "leo-park-dev", "threads": "leo.park.dev", }, avatar_bg="#0f766e", avatar_accent="#f8fafc", ), AccountSeed( username="sara_chen", email="sara.chen@kohakuhub.dev", full_name="Sara Chen", bio=( "Annotation lead for invoice, receipt, and layout-heavy datasets. " "Prefers clean schemas over magical post-processing." ), website="https://kohakuhub.local/sara-chen", social_media={ "github": "sara-chen-data", "huggingface": "sara-chen-data", }, avatar_bg="#7c2d12", avatar_accent="#fde68a", ), AccountSeed( username="noah_kim", email="noah.kim@kohakuhub.dev", full_name="Noah Kim", bio=( "Ships compact vision models for harbor monitoring, segmentation, " "and camera-side smoke testing." ), website="https://kohakuhub.local/noah-kim", social_media={ "github": "noah-kim-vision", "twitter_x": "noahkimvision", }, avatar_bg="#1d4ed8", avatar_accent="#dbeafe", ), AccountSeed( username="ivy_ops", email="ivy.ops@kohakuhub.dev", full_name="Ivy Ops", bio=( "Release and infra support. Uses stable, boring fixtures so bug " "reports stay reproducible." ), website="https://kohakuhub.local/ivy-ops", social_media={ "github": "ivy-ops", }, avatar_bg="#3f3f46", avatar_accent="#f4f4f5", ), ) ORGANIZATIONS: tuple[OrganizationSeed, ...] = ( OrganizationSeed( name="aurora-labs", description=( "Applied document intelligence team building OCR-friendly models, " "datasets, and lightweight internal tooling." ), bio=( "Aurora Labs curates multilingual OCR assets for receipts, forms, " "and customer-service automation." ), website="https://aurora-labs.kohakuhub.local", social_media={ "github": "aurora-labs", "huggingface": "aurora-labs", }, avatar_bg="#312e81", avatar_accent="#e0e7ff", members=( ("mai_lin", "super-admin"), ("leo_park", "admin"), ("sara_chen", "member"), ("ivy_ops", "visitor"), ), ), OrganizationSeed( name="harbor-vision", description=( "Small computer-vision team for coastal monitoring, dock safety, " "and camera-ready deployment checks." ), bio=( "Harbor Vision maintains compact segmentation and inspection models " "for edge-friendly marine operations." ), website="https://harbor-vision.kohakuhub.local", social_media={ "github": "harbor-vision", "twitter_x": "harborvision", }, avatar_bg="#0f766e", avatar_accent="#ccfbf1", members=( ("mai_lin", "super-admin"), ("noah_kim", "super-admin"), ("leo_park", "visitor"), ), ), ) def build_scale_accounts() -> tuple[AccountSeed, ...]: specs = ( ( "mila_zhou", "Mila Zhou", "Dataset release engineer focused on parquet validation, shard manifests, and large org operations.", "mila-zhou-data", "#4c1d95", "#ede9fe", ), ( "ethan_reed", "Ethan Reed", "Model packaging owner who keeps tokenizer assets, shard indexes, and release notes tidy.", "ethan-reed-models", "#0f766e", "#ccfbf1", ), ( "olivia_hart", "Olivia Hart", "Benchmarks multimodal search pipelines and curates reproducible evaluation bundles.", "olivia-hart-ai", "#9a3412", "#ffedd5", ), ( "liam_north", "Liam North", "Owns local demo QA for file-tree pagination, deep directory browsing, and download flows.", "liam-north-labs", "#1d4ed8", "#dbeafe", ), ( "zoe_park", "Zoe Park", "Keeps audio, image, and video fixtures aligned with product demos and ingestion checks.", "zoe-park-media", "#065f46", "#d1fae5", ), ( "owen_davis", "Owen Davis", "Maintains synthetic but structurally realistic model exports for offline smoke testing.", "owen-davis-ml", "#7c2d12", "#fed7aa", ), ( "mia_cross", "Mia Cross", "Curates metadata-heavy datasets with stable labels and repeatable schema previews.", "mia-cross-data", "#be123c", "#ffe4e6", ), ( "lucas_tan", "Lucas Tan", "Documents retrieval pipelines, indexed archives, and annotation workflows for the team.", "lucas-tan-docs", "#1e3a8a", "#dbeafe", ), ( "ava_scott", "Ava Scott", "Runs browser-first QA against large org listings, search results, and activity views.", "ava-scott-qa", "#854d0e", "#fef3c7", ), ( "jackson_liu", "Jackson Liu", "Tracks media indexing pipelines and long-tail file format regressions.", "jackson-liu-index", "#155e75", "#cffafe", ), ( "grace_hill", "Grace Hill", "Handles org membership operations and permissions reviews for shared demo spaces.", "grace-hill-ops", "#6d28d9", "#ede9fe", ), ( "henry_wu", "Henry Wu", "Maintains multilingual dataset snapshots and local release validation checklists.", "henry-wu-data", "#92400e", "#fef3c7", ), ) return tuple( AccountSeed( username=username, email=f"{username.replace('_', '.')}@kohakuhub.dev", full_name=full_name, bio=bio, website=f"https://kohakuhub.local/{username.replace('_', '-')}", social_media={ "github": github_handle, "huggingface": github_handle, }, avatar_bg=avatar_bg, avatar_accent=avatar_accent, ) for username, full_name, bio, github_handle, avatar_bg, avatar_accent in specs ) SCALE_ACCOUNTS = build_scale_accounts() ACCOUNTS = ACCOUNTS + SCALE_ACCOUNTS OPEN_MEDIA_MEMBERS: tuple[tuple[str, str], ...] = ( ("mai_lin", "super-admin"), ("leo_park", "admin"), ("sara_chen", "admin"), ("ivy_ops", "admin"), ("noah_kim", "member"), ("mila_zhou", "admin"), ("ethan_reed", "member"), ("olivia_hart", "member"), ("liam_north", "member"), ("zoe_park", "member"), ("owen_davis", "member"), ("mia_cross", "member"), ("lucas_tan", "member"), ("ava_scott", "visitor"), ("jackson_liu", "member"), ("grace_hill", "visitor"), ("henry_wu", "member"), ) ORGANIZATIONS = ORGANIZATIONS + ( OrganizationSeed( name="open-media-lab", description=( "Shared local-dev org packed with multimodal fixtures, large repo lists, " "and high-member-count collaboration scenarios." ), bio=( "Open Media Lab maintains reproducible multimodal assets for UI browsing, " "download tracking, metadata QA, and repository management demos." ), website="https://open-media-lab.kohakuhub.local", social_media={ "github": "open-media-lab", "huggingface": "open-media-lab", }, avatar_bg="#0f172a", avatar_accent="#bae6fd", members=OPEN_MEDIA_MEMBERS, ), ) SAFEBOORU_IMAGE_ASSETS: tuple[RemoteAsset, ...] = ( RemoteAsset( cache_name="safebooru-canal-reflections.png", url="https://cdn.donmai.us/original/79/a6/79a6c565714b36c5689131085d70a8a2.png", sha256="4b0b07d9f6d2658346525326567f4db7aebeae8b2ade4facb0f56f9972bdb669", source_url="https://safebooru.donmai.us/posts/11208212", ), RemoteAsset( cache_name="safebooru-mountain-church.jpg", url="https://cdn.donmai.us/original/dc/d4/dcd4a809e6efc402363720a6714bc4f7.jpg", sha256="a688df893449c757d979ff877aa1a3f006de649686ed0f5b101e807808e1dbc7", source_url="https://safebooru.donmai.us/posts/11207803", ), RemoteAsset( cache_name="safebooru-sand-plain.jpg", url="https://cdn.donmai.us/original/e8/20/e8201ebfcf9802fd5b74f126ae501406.jpg", sha256="14420b7849ab8922914d2ccc5d32abbf25ae26642ea50dfbb15096a8d9e85503", source_url="https://safebooru.donmai.us/posts/11207788", ), RemoteAsset( cache_name="safebooru-fence-field.jpg", url="https://cdn.donmai.us/original/5d/28/5d2833c4731c2b8631eefe5f89cd2541.jpg", sha256="e7eec10df1393ee661da300612b84cc4b0f8052d54aae4244cddaaaeb50a3d79", source_url="https://safebooru.donmai.us/posts/11207775", ), RemoteAsset( cache_name="safebooru-forest-lake.jpg", url="https://cdn.donmai.us/original/08/33/08330cb79116cd7dd1000f702b28c4f3.jpg", sha256="565520f058666a04953a1cbc8db67b2687fde240bb26b29d9b1008f562d78aa6", source_url="https://safebooru.donmai.us/posts/11207641", ), RemoteAsset( cache_name="safebooru-fantasy-castle.jpg", url="https://cdn.donmai.us/original/31/45/3145abe70177f3d01150a8fa9aa692dc.jpg", sha256="1d52643e22021364650176ff5c47e70ee101020f3329f9cd1f44b9aad739737a", source_url="https://safebooru.donmai.us/posts/11207593", ), RemoteAsset( cache_name="safebooru-phainon-cyrene.jpg", url=( "https://cdn.donmai.us/original/29/82/" "__phainon_and_cyrene_honkai_and_1_more_drawn_by_whyte_srsn__" "298282d12b00b563a09bebb65cc11116.jpg" ), sha256="8c8e04d47dea6ba020c6f0ec96932aaf760101b1cd358ba6eb829aa908f52b2f", source_url="https://safebooru.donmai.us/posts/9740876", ), RemoteAsset( cache_name="safebooru-sunflower-field.png", url=( "https://cdn.donmai.us/original/65/dd/" "__shirakami_fubuki_hololive_drawn_by_hyde_tabakko__" "65ddfa390ca539e6f9ed9658d65c77c4.png" ), sha256="c6a157e11758d8b1584502f772f1300c2a0b9e00ba7d9d883fd6b24b247181c0", source_url="https://safebooru.donmai.us/posts/9779697", ), RemoteAsset( cache_name="safebooru-grass-wonder.jpg", url=( "https://cdn.donmai.us/original/f9/5f/" "__grass_wonder_umamusume_and_1_more_drawn_by_fuuseppu__" "f95f1c3cdc9e69d9f2de613dc8117df2.jpg" ), sha256="35d08757090287d2fa465cc7ab959829b3df03c18e254580fc6ecbb8dc1cb118", source_url="https://safebooru.donmai.us/posts/9658576", ), RemoteAsset( cache_name="safebooru-paper-boat.jpg", url=( "https://cdn.donmai.us/original/f2/66/" "__sameko_saba_indie_virtual_youtuber_drawn_by_sky_above_me__" "f2664dc9d6a90473cf49234a3f30bea1.jpg" ), sha256="ae20506f36504895708fe1c85979c1dede228571044457bd5e91daaa1415ce7e", source_url="https://safebooru.donmai.us/posts/9599213", ), ) REMOTE_MEDIA_ASSETS: dict[str, RemoteAsset] = { asset.cache_name: asset for asset in ( *SAFEBOORU_IMAGE_ASSETS, RemoteAsset( cache_name="voices-speech.wav", url=( "https://download.pytorch.org/torchaudio/tutorial-assets/" "Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" ), sha256="c65fcd726d6b08c82c1e5dc7558f863cd8d483e3ed2f4a7bcf271dc1865ada14", source_url=( "https://download.pytorch.org/torchaudio/tutorial-assets/" "Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" ), ), RemoteAsset( cache_name="steam-train-whistle.wav", url=( "https://download.pytorch.org/torchaudio/tutorial-assets/" "steam-train-whistle-daniel_simon.wav" ), sha256="762b6783be7f20aa8be03812eeb33184bb5b1497db7422607a70b5d441fc45e9", source_url=( "https://download.pytorch.org/torchaudio/tutorial-assets/" "steam-train-whistle-daniel_simon.wav" ), ), RemoteAsset( cache_name="opencv-vtest.avi", url="https://raw.githubusercontent.com/opencv/opencv/4.x/samples/data/vtest.avi", sha256="45cddc9490be69345cbdab64ca583be65987e864ca408038e648db99e10516cf", source_url="https://github.com/opencv/opencv/blob/4.x/samples/data/vtest.avi", ), # Real HF-hosted fixtures used to exercise the pure-client preview # path (issue #27). Both files are small (~500 KB each), pinned by # sha256, and sourced from long-stable public HF test artifacts so # the seed stays deterministic across runs. RemoteAsset( cache_name="hf-tiny-random-bert.safetensors", url="https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/model.safetensors", sha256="965f02b6a7e5520fc12f710e4e3b6132f697f1c8f648819553c5ade86752d2de", source_url="https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/model.safetensors", ), RemoteAsset( cache_name="hf-no-robots-test.parquet", url="https://huggingface.co/datasets/HuggingFaceH4/no_robots/resolve/main/data/test-00000-of-00001.parquet", sha256="60707b2636a46e37bb0c1e9ca263a18553f430317b7a53c691676d6a492fc0f2", source_url="https://huggingface.co/datasets/HuggingFaceH4/no_robots/blob/main/data/test-00000-of-00001.parquet", ), ) } def text_bytes(body: str) -> bytes: return (textwrap.dedent(body).strip() + "\n").encode("utf-8") def json_bytes(payload: dict | list) -> bytes: return (json.dumps(payload, indent=2, sort_keys=True) + "\n").encode("utf-8") def csv_bytes(rows: Iterable[Iterable[str]]) -> bytes: lines = [",".join(row) for row in rows] return ("\n".join(lines) + "\n").encode("utf-8") def jsonl_bytes(rows: Iterable[dict]) -> bytes: return ("\n".join(json.dumps(row, sort_keys=True) for row in rows) + "\n").encode( "utf-8" ) def profile_space_files(title: str, summary: str, accent: str) -> tuple[tuple[str, bytes], ...]: return ( ( "README.md", text_bytes( f""" --- title: {title} emoji: "\u2605" colorFrom: indigo colorTo: amber sdk: gradio sdk_version: "4.44.0" --- # {title} {summary} This space exists so local profile pages render with realistic content instead of an empty placeholder repository. """ ), ), ( "app.py", text_bytes( f""" import gradio as gr demo = gr.Interface( fn=lambda text: "{title}: " + text.strip(), inputs=gr.Textbox(label="Prompt"), outputs=gr.Textbox(label="Response"), title="{title}", description="{summary}", theme=gr.themes.Soft(primary_hue="{accent}"), ) if __name__ == "__main__": demo.launch() """ ), ), ("requirements.txt", text_bytes("gradio>=4.44.0")), ) def seed_file(path: str, content: bytes | Callable[[], bytes]) -> FileSeed: return FileSeed(path=path, content=content) def materialize_seed_file(file_entry: SeedFile) -> tuple[str, bytes]: if isinstance(file_entry, FileSeed): content = file_entry.content() if callable(file_entry.content) else file_entry.content return file_entry.path, content return file_entry _ASSET_BYTES_CACHE: dict[str, bytes] = {} def patterned_bytes(label: str, size_bytes: int, *, header: bytes = b"") -> bytes: if size_bytes <= len(header): return header[:size_bytes] pattern = bytearray() counter = 0 while len(pattern) < 4096: pattern.extend(hashlib.sha256(f"{label}:{counter}".encode("utf-8")).digest()) counter += 1 body_size = size_bytes - len(header) repeated = (bytes(pattern) * math.ceil(body_size / len(pattern)))[:body_size] return header + repeated def sha256_hex(data: bytes) -> str: return hashlib.sha256(data).hexdigest() def fetch_remote_asset(asset: RemoteAsset) -> bytes: cached = _ASSET_BYTES_CACHE.get(asset.cache_name) if cached is not None: return cached cache_path = SEED_ASSET_CACHE_DIR / asset.cache_name if cache_path.is_file(): data = cache_path.read_bytes() if sha256_hex(data) == asset.sha256: _ASSET_BYTES_CACHE[asset.cache_name] = data return data cache_path.parent.mkdir(parents=True, exist_ok=True) response = requests.get( asset.url, timeout=180, headers={"User-Agent": "KohakuHubLocalSeed/1.0"}, ) response.raise_for_status() data = response.content actual_sha256 = sha256_hex(data) if actual_sha256 != asset.sha256: raise SeedError( f"Remote asset hash mismatch for {asset.cache_name}: " f"expected {asset.sha256}, got {actual_sha256}" ) tmp_path = cache_path.with_suffix(f"{cache_path.suffix}.part") tmp_path.write_bytes(data) tmp_path.replace(cache_path) _ASSET_BYTES_CACHE[asset.cache_name] = data return data def remote_asset_bytes(asset_name: str) -> bytes: return fetch_remote_asset(REMOTE_MEDIA_ASSETS[asset_name]) def make_realistic_float16_tensor(label: str, shape: tuple[int, ...]) -> np.ndarray: element_count = math.prod(shape) raw_values = np.frombuffer(patterned_bytes(label, element_count * 2), dtype=" tuple[bytes, int]: tensors: dict[str, np.ndarray] = {} total_tensor_bytes = 0 for tensor_name, shape in tensor_specs: tensor = make_realistic_float16_tensor(f"{label}:{tensor_name}", shape) tensors[tensor_name] = tensor total_tensor_bytes += tensor.nbytes payload = save_safetensors( tensors, metadata={ "format": "pt", "seed_label": label, **(metadata or {}), }, ) return payload, total_tensor_bytes def make_single_checkpoint_bytes( label: str, tensor_specs: tuple[tuple[str, tuple[int, ...]], ...], ) -> bytes: payload, _ = make_safetensors_bytes(label, tensor_specs) return payload def make_parquet_bytes( label: str, *, row_count: int = 12000, payload_size: int = 2048, ) -> bytes: base_payload = patterned_bytes(f"{label}-payload", payload_size) payloads = [] sample_ids = [] captions = [] durations = [] for row_index in range(row_count): prefix = f"{label}:{row_index:05d}|".encode("utf-8") payloads.append(prefix + base_payload[: payload_size - len(prefix)]) sample_ids.append(f"{label}_{row_index:05d}") captions.append( f"{label} multimodal benchmark row {row_index:05d} for local dataset preview checks." ) durations.append(round(1.5 + (row_index % 11) * 0.25, 3)) table = pa.table( { "sample_id": pa.array(sample_ids, type=pa.string()), "caption": pa.array(captions, type=pa.string()), "duration_seconds": pa.array(durations, type=pa.float32()), "payload": pa.array(payloads, type=pa.binary()), } ) buffer = io.BytesIO() pq.write_table( table, buffer, compression="NONE", use_dictionary=False, row_group_size=512, ) return buffer.getvalue() def make_indexed_tar_bundle( label: str, files: tuple[tuple[str, bytes], ...], ) -> tuple[bytes, bytes]: tar_buffer = io.BytesIO() with tarfile.open(fileobj=tar_buffer, mode="w") as handle: for path, content in files: info = tarfile.TarInfo(name=path) info.size = len(content) info.mode = 0o644 info.mtime = 0 info.uid = 0 info.gid = 0 info.uname = "" info.gname = "" handle.addfile(info, io.BytesIO(content)) tar_bytes = tar_buffer.getvalue() with tempfile.TemporaryDirectory(prefix="kohakuhub-seed-tar-") as tmp_dir: tar_path = Path(tmp_dir) / f"{label}.tar" tar_path.write_bytes(tar_bytes) index_info = hf_index.tar_get_index_info(str(tar_path), silent=True) index_bytes = json_bytes(index_info) return tar_bytes, index_bytes def make_deep_tree_files(label: str) -> tuple[SeedFile, ...]: files: list[SeedFile] = [] for section in range(1, 7): for shard in range(1, 9): for leaf in range(1, 7): path = ( f"catalog/section-{section:02d}/tier-{shard:02d}/" f"branch-{leaf:02d}/node-{section:02d}-{shard:02d}-{leaf:02d}/" f"entry-{section:02d}-{shard:02d}-{leaf:02d}.json" ) files.append( ( path, json_bytes( { "checksum": hashlib.sha256(path.encode("utf-8")).hexdigest(), "fixture": label, "leaf": leaf, "section": section, "shard": shard, } ), ) ) files.extend( ( ( "README.md", text_bytes( """ # hierarchy-crawl-fixtures This repo intentionally contains many files and deep path nesting so local tree browsing, pagination, and search remain easy to exercise. """ ), ), ( "manifests/root-index.json", json_bytes( { "depth": 4, "generated_files": len(files), "label": label, } ), ), ) ) return tuple(files) def build_repo_seeds() -> tuple[RepoSeed, ...]: return ( RepoSeed( actor="mai_lin", repo_type="model", namespace="mai_lin", name="lineart-caption-base", private=False, commits=( CommitSeed( summary="Bootstrap base caption model", description=( "Create the public demo model repo with a realistic README, " "lightweight config, and a small LFS-tracked checkpoint." ), files=( ( "README.md", text_bytes( """ --- license: mit library_name: transformers pipeline_tag: image-to-text tags: - captioning - line-art - document-vision --- # lineart-caption-base A compact caption model tuned for monochrome line art, icon-heavy diagrams, and OCR-adjacent illustrations. ## Intended use - draft captions for internal QA dashboards - generate quick prompts for reviewers - validate frontend metadata rendering """ ), ), ( "config.json", json_bytes( { "architectures": ["VisionEncoderDecoderModel"], "decoder_layers": 6, "encoder_layers": 12, "image_size": 448, "model_type": "lineart-caption-base", "vocab_size": 32000, } ), ), ( "tokenizer.json", json_bytes( { "added_tokens": [], "normalizer": {"type": "NFKC"}, "pre_tokenizer": {"type": "Whitespace"}, "version": "1.0", } ), ), ("examples/prompt.txt", text_bytes("Describe the icon, layout, and visible text.")), seed_file( "checkpoints/lineart-caption-base.safetensors", lambda: make_single_checkpoint_bytes( "lineart-caption-base", ( ( "encoder.vision_model.embeddings.patch_embedding.weight", (4096, 1024), ), ("decoder.model.embed_tokens.weight", (1024, 768)), ), ), ), ), ), CommitSeed( summary="Add eval notes and release metrics", description="Follow-up commit so commit history and file updates are visible in local UI.", files=( ( "README.md", text_bytes( """ --- license: mit library_name: transformers pipeline_tag: image-to-text tags: - captioning - line-art - document-vision --- # lineart-caption-base A compact caption model tuned for monochrome line art, icon-heavy diagrams, and OCR-adjacent illustrations. ## Current release - validation CIDEr: 1.38 - latency target: <120 ms on local A10G - known gap: dense legends still need manual review """ ), ), ( "eval/metrics.json", json_bytes( { "cider": 1.38, "clip_score": 0.284, "latency_ms_p50": 87, "latency_ms_p95": 114, } ), ), ( "docs/training-notes.md", text_bytes( """ # Training Notes - Base corpus: 82k internal line-art render pairs - Additional hard negatives: 4k cluttered signage crops - Checkpoint exported for small-batch browser smoke tests """ ), ), ), ), ), branch="ablation-notes", tag="v0.2.1", download_path="checkpoints/lineart-caption-base.safetensors", download_sessions=4, ), RepoSeed( actor="mai_lin", repo_type="dataset", namespace="mai_lin", name="street-sign-zh-en", private=False, commits=( CommitSeed( summary="Import bilingual street sign dataset", description="Seed a CSV-backed dataset that exercises dataset preview and tree views.", files=( ( "README.md", text_bytes( """ --- license: cc-by-4.0 task_categories: - image-text-to-text language: - zh - en pretty_name: Street Sign ZH EN --- # street-sign-zh-en A small bilingual dataset for OCR-friendly sign translation and layout QA. Rows keep the original text, translation, and scene tag. """ ), ), ( "data/train.csv", csv_bytes( ( ("image", "text_zh", "text_en", "scene"), ("img_0001.png", "\u5317\u4eac\u7ad9", "Beijing Railway Station", "station"), ("img_0002.png", "\u5c0f\u5fc3\u53f0\u9636", "Watch Your Step", "retail"), ("img_0003.png", "\u7981\u6b62\u5438\u70df", "No Smoking", "hospital"), ("img_0004.png", "\u53f3\u8f6c\u8f66\u9053", "Right Turn Only", "road"), ) ), ), ( "data/validation.csv", csv_bytes( ( ("image", "text_zh", "text_en", "scene"), ("val_0001.png", "\u51fa\u53e3", "Exit", "mall"), ("val_0002.png", "\u670d\u52a1\u53f0", "Service Desk", "airport"), ) ), ), ( "metadata/features.json", json_bytes( { "image": "string", "text_zh": "string", "text_en": "string", "scene": "string", } ), ), ), ), CommitSeed( summary="Add preview samples for dataset viewer", description="Include JSONL samples and notebook notes for local bug reproduction.", files=( ( "README.md", text_bytes( """ --- license: cc-by-4.0 task_categories: - image-text-to-text language: - zh - en pretty_name: Street Sign ZH EN --- # street-sign-zh-en A small bilingual dataset for OCR-friendly sign translation and layout QA. Rows keep the original text, translation, and scene tag. ## Notes Validation rows intentionally mix transport, retail, and public service scenarios so sorting and filtering bugs are easier to spot. """ ), ), ( "previews/samples.jsonl", jsonl_bytes( ( { "image": "img_0001.png", "text_zh": "\u5317\u4eac\u7ad9", "text_en": "Beijing Railway Station", "scene": "station", }, { "image": "img_0002.png", "text_zh": "\u5c0f\u5fc3\u53f0\u9636", "text_en": "Watch Your Step", "scene": "retail", }, ) ), ), ( "notebooks/README.md", text_bytes( """ # Notebook Notes This dataset is intentionally tiny in local dev. The point is to exercise preview, pagination, and schema rendering without waiting on a large bootstrap import. """ ), ), ), ), ), branch="qa-pass", tag="2026-04-demo", download_path="data/train.csv", download_sessions=8, ), RepoSeed( actor="mai_lin", repo_type="space", namespace="mai_lin", name="mai_lin", private=False, commits=( CommitSeed( summary="Create profile showcase space", description="Provide a same-name space so local profile pages render a realistic card.", files=profile_space_files( "Mai Lin Workspace", "Small utilities and pinned demos used for local reproduction.", "amber", ), ), CommitSeed( summary="Add profile theme preset", description="A second commit makes the space history non-empty for UI testing.", files=( ( "assets/theme.json", json_bytes( { "accent": "amber", "layout": "split", "panels": ["repos", "activity", "notes"], } ), ), ), ), ), ), RepoSeed( actor="mai_lin", repo_type="dataset", namespace="mai_lin", name="internal-evals", private=True, commits=( CommitSeed( summary="Seed private eval artifacts", description="Keep one private user-owned repo for auth and permission checks.", files=( ( "README.md", text_bytes( """ # internal-evals Private staging area for eval summaries and failure-case review. This repo is intentionally private and only accessible to Mai. """ ), ), ( "runs/2026-04-15-summary.json", json_bytes( { "caption_regressions": 7, "dataset": "street-sign-zh-en", "notes": "False positives cluster around mirrored storefront text.", } ), ), ( "data/failure_cases.jsonl", jsonl_bytes( ( { "file": "eval_001.png", "issue": "mirror_text", "severity": "medium", }, { "file": "eval_002.png", "issue": "crowded_legend", "severity": "high", }, ) ), ), ), ), CommitSeed( summary="Add reviewer checklist", description="Second commit for commit-history coverage on a private repo.", files=( ( "notes/reviewer-checklist.md", text_bytes( """ # Reviewer Checklist - confirm sample renders in dataset viewer - compare translated text against bilingual CSV rows - log UI regressions with the seeded repo name """ ), ), ), ), ), download_path="runs/2026-04-15-summary.json", download_sessions=1, ), RepoSeed( actor="mai_lin", repo_type="space", namespace="aurora-labs", name="aurora-labs", private=False, commits=( CommitSeed( summary="Create org showcase space", description="Same-name org space keeps organization profile pages representative.", files=profile_space_files( "Aurora Labs Demo Portal", "Landing page for OCR demos, pinned datasets, and release notes.", "indigo", ), ), CommitSeed( summary="Add roadmap note", description="A lightweight follow-up commit for org space history.", files=( ( "docs/roadmap.md", text_bytes( """ # Local Demo Roadmap - tighten OCR-lite benchmark reporting - keep receipt-layout-bench labels stable for bug repro - mirror one private support model for permission testing """ ), ), ), ), ), ), RepoSeed( actor="mai_lin", repo_type="model", namespace="aurora-labs", name="aurora-ocr-lite", private=False, commits=( CommitSeed( summary="Publish OCR-lite baseline", description="Public model repo with LFS checkpoint and readable metadata.", files=( ( "README.md", text_bytes( """ --- license: apache-2.0 library_name: transformers pipeline_tag: image-to-text tags: - ocr - receipts - multilingual --- # aurora-ocr-lite An OCR-focused checkpoint for receipt snippets, payment slips, and service counter paperwork. """ ), ), ( "config.json", json_bytes( { "backbone": "vit-small-patch16-384", "decoder": "bart-base", "max_position_embeddings": 512, "torch_dtype": "float16", } ), ), ( "vocab.txt", text_bytes( """ [PAD] [UNK] total subtotal tax cashier paid """ ), ), seed_file( "checkpoints/aurora-ocr-lite.safetensors", lambda: make_single_checkpoint_bytes( "aurora-ocr-lite", ( ("encoder.patch_embed.proj.weight", (6144, 1024)), ("decoder.model.embed_tokens.weight", (2048, 1024)), ), ), ), ), ), CommitSeed( summary="Add benchmark export and release notes", description="Keep one public org model slightly more active for trending and history views.", files=( ( "README.md", text_bytes( """ --- license: apache-2.0 library_name: transformers pipeline_tag: image-to-text tags: - ocr - receipts - multilingual --- # aurora-ocr-lite An OCR-focused checkpoint for receipt snippets, payment slips, and service counter paperwork. ## Release notes - reduced hallucinated currency markers on narrow receipt crops - added benchmark export used by the admin dashboard smoke tests """ ), ), ( "eval/benchmark.json", json_bytes( { "cer": 0.081, "wer": 0.119, "latency_ms_p50": 64, "latency_ms_p95": 92, } ), ), ( "scripts/export_notes.md", text_bytes( """ # Export Notes Checkpoint is intentionally small and fake. It only exists so local flows hit LFS, quota, and file-tree code paths. """ ), ), ), ), ), branch="benchmark-v2", tag="v0.3.0", download_path="checkpoints/aurora-ocr-lite.safetensors", download_sessions=12, ), RepoSeed( actor="leo_park", repo_type="dataset", namespace="aurora-labs", name="receipt-layout-bench", private=False, commits=( CommitSeed( summary="Create receipt layout benchmark", description="Public dataset repo with JSONL splits for dataset preview coverage.", files=( ( "README.md", text_bytes( """ --- license: cc-by-4.0 pretty_name: Receipt Layout Bench task_categories: - token-classification --- # receipt-layout-bench Annotation benchmark for merchant, total, tax, and timestamp spans. """ ), ), ( "splits/train.jsonl", jsonl_bytes( ( { "image": "train_0001.png", "merchant": "North Pier Cafe", "total": "18.40", "currency": "USD", }, { "image": "train_0002.png", "merchant": "River Town Mart", "total": "42.15", "currency": "USD", }, ) ), ), ( "splits/test.jsonl", jsonl_bytes( ( { "image": "test_0001.png", "merchant": "Airport Bento", "total": "9.80", "currency": "USD", }, { "image": "test_0002.png", "merchant": "Harbor Books", "total": "27.10", "currency": "USD", }, ) ), ), ( "schema/fields.json", json_bytes( { "merchant": "string", "total": "string", "currency": "string", "timestamp": "string", } ), ), ), ), CommitSeed( summary="Add annotation guide", description="Second dataset commit for history, tree diffing, and docs rendering.", files=( ( "docs/annotation-guide.md", text_bytes( """ # Annotation Guide - mark printed totals, not handwritten notes - keep currency in a dedicated field - preserve merchant spelling from source image """ ), ), ( "README.md", text_bytes( """ --- license: cc-by-4.0 pretty_name: Receipt Layout Bench task_categories: - token-classification --- # receipt-layout-bench Annotation benchmark for merchant, total, tax, and timestamp spans. The local seed intentionally mixes neat and messy receipts to cover pagination, filters, and table previews. """ ), ), ), ), ), branch="supplier-a-refresh", tag="v1.0.0", download_path="splits/test.jsonl", download_sessions=5, ), RepoSeed( actor="mai_lin", repo_type="model", namespace="aurora-labs", name="customer-support-rag", private=True, commits=( CommitSeed( summary="Seed private support model workspace", description="Private org repo for auth-only browsing and settings checks.", files=( ( "README.md", text_bytes( """ # customer-support-rag Internal-only retrieval and prompt assets for support workflows. This repo is private and visible to Aurora Labs members only. """ ), ), ( "prompt/system.txt", text_bytes( """ You are a cautious support assistant. Answer only with facts from the indexed knowledge base, and cite the exact article title. """ ), ), ( "retrieval/index-schema.json", json_bytes( { "article_id": "string", "channel": "string", "lang": "string", "text": "string", } ), ), ( "config.json", json_bytes( { "chunk_size": 384, "embedding_model": "bge-small-en-v1.5", "top_k": 6, } ), ), ), ), CommitSeed( summary="Add ops runbook", description="Keep a second private-org commit for local history inspection.", files=( ( "docs/runbook.md", text_bytes( """ # Runbook - refresh embeddings weekly - snapshot prompts before frontend demos - record regressions against the fixed local seed data """ ), ), ), ), ), download_path="prompt/system.txt", download_sessions=1, ), RepoSeed( actor="noah_kim", repo_type="model", namespace="harbor-vision", name="marine-seg-small", private=False, commits=( CommitSeed( summary="Publish marine segmentation starter model", description="Public vision model with another fake LFS checkpoint.", files=( ( "README.md", text_bytes( """ --- license: apache-2.0 pipeline_tag: image-segmentation tags: - segmentation - marine - edge --- # marine-seg-small Compact segmentation model for harbor waterlines, safety zones, and dock equipment outlines. """ ), ), ( "config.json", json_bytes( { "backbone": "convnext-tiny", "classes": ["water", "dock", "vessel", "buoy"], "input_size": 512, } ), ), ( "labels.json", json_bytes( { "0": "water", "1": "dock", "2": "vessel", "3": "buoy", } ), ), seed_file( "checkpoints/marine-seg-small.safetensors", lambda: make_single_checkpoint_bytes( "marine-seg-small", ( ("backbone.stem.conv1.weight", (4096, 1536)), ("decode_head.classifier.weight", (1024, 1024)), ), ), ), ), ), CommitSeed( summary="Add harbor evaluation report", description="Second model commit for history and stats coverage.", files=( ( "README.md", text_bytes( """ --- license: apache-2.0 pipeline_tag: image-segmentation tags: - segmentation - marine - edge --- # marine-seg-small Compact segmentation model for harbor waterlines, safety zones, and dock equipment outlines. ## Eval highlights - best IoU on waterline masks from overcast camera feeds - weaker on stacked cargo edges during dusk """ ), ), ( "eval/coastal-harbor.json", json_bytes( { "iou_dock": 0.84, "iou_vessel": 0.79, "iou_water": 0.91, } ), ), ), ), ), branch="saltwater-eval", tag="v1.1.0", download_path="checkpoints/marine-seg-small.safetensors", download_sessions=6, ), RepoSeed( actor="noah_kim", repo_type="space", namespace="harbor-vision", name="smoke-test-dashboard", private=True, commits=( CommitSeed( summary="Create private smoke-test dashboard", description="Private org space used for auth and space rendering checks.", files=( ( "README.md", text_bytes( """ # smoke-test-dashboard Private dashboard for camera ingest smoke tests and deployment sign-off. """ ), ), ( "app.py", text_bytes( """ import gradio as gr dashboard = gr.Interface( fn=lambda status: f"dashboard status: {status}", inputs=gr.Textbox(label="Input"), outputs=gr.Textbox(label="Output"), title="Smoke Test Dashboard", ) if __name__ == "__main__": dashboard.launch() """ ), ), ("requirements.txt", text_bytes("gradio>=4.44.0")), ), ), CommitSeed( summary="Add dashboard notes", description="Second private-space commit for browsing stateful history locally.", files=( ( "dashboards/README.md", text_bytes( """ # Dashboard Notes Fixed local fixtures are better than random telemetry when the goal is to reproduce layout and auth bugs. """ ), ), ), ), ), download_path="README.md", download_sessions=1, ), RepoSeed( actor="leo_park", repo_type="space", namespace="leo_park", name="formula-checker-lite", private=False, commits=( CommitSeed( summary="Create public formula checker demo", description="Lightweight public space for user profile and space listings.", files=( ( "README.md", text_bytes( """ # formula-checker-lite Small browser demo that validates spreadsheet-style formulas and flags obviously broken references. """ ), ), ( "app.py", text_bytes( """ import gradio as gr def validate(expr: str) -> str: return "looks valid" if "=" in expr else "missing leading =" demo = gr.Interface( fn=validate, inputs=gr.Textbox(label="Formula"), outputs=gr.Textbox(label="Status"), title="Formula Checker Lite", ) if __name__ == "__main__": demo.launch() """ ), ), ("requirements.txt", text_bytes("gradio>=4.44.0")), ), ), CommitSeed( summary="Add preset expressions", description="Second commit keeps this user-owned space non-trivial.", files=( ( "assets/presets.json", json_bytes( { "valid": "=SUM(A1:A3)", "invalid": "SUM(A1:A3)", "cross_sheet": "=Sheet2!B4", } ), ), ), ), ), download_path="README.md", download_sessions=2, ), RepoSeed( actor="sara_chen", repo_type="dataset", namespace="sara_chen", name="invoice-entities-mini", private=False, commits=( CommitSeed( summary="Seed invoice entity dataset", description="Public user dataset so profile pages are not empty.", files=( ( "README.md", text_bytes( """ --- license: cc-by-4.0 pretty_name: Invoice Entities Mini task_categories: - token-classification --- # invoice-entities-mini Tiny invoice entity dataset for local schema, preview, and table rendering checks. """ ), ), ( "data/train.jsonl", jsonl_bytes( ( { "invoice_id": "inv_1001", "vendor": "Blue Harbor Logistics", "amount": "1240.00", }, { "invoice_id": "inv_1002", "vendor": "Northline Design", "amount": "315.50", }, ) ), ), ( "data/test.jsonl", jsonl_bytes( ( { "invoice_id": "inv_2001", "vendor": "River Street Foods", "amount": "89.20", }, ) ), ), ( "schema.json", json_bytes( { "invoice_id": "string", "vendor": "string", "amount": "string", } ), ), ), ), CommitSeed( summary="Add notebook notes", description="Second public dataset commit for file tree and commit history coverage.", files=( ( "notebooks/README.md", text_bytes( """ # Notebook Notes Keep the local seed tiny. If a preview bug shows up here, it is much easier to reason about than a random large import. """ ), ), ), ), ), download_path="data/train.jsonl", download_sessions=3, ), ) def build_open_media_core_repo_seeds() -> tuple[RepoSeed, ...]: archive_cache: dict[str, tuple[bytes, bytes]] = {} model_bundle_cache: dict[str, dict[str, bytes]] = {} top_level_image_assets = ( SAFEBOORU_IMAGE_ASSETS[:4] + SAFEBOORU_IMAGE_ASSETS[-2:] ) archive_image_assets = SAFEBOORU_IMAGE_ASSETS top_level_media_entries = ( ("media/audio/voices-speech.wav", "voices-speech.wav"), ("media/audio/steam-train-whistle.wav", "steam-train-whistle.wav"), ("media/video/opencv-vtest.avi", "opencv-vtest.avi"), *( (f"media/images/{asset.cache_name}", asset.cache_name) for asset in top_level_image_assets ), ) def archive_bundle() -> tuple[bytes, bytes]: cached = archive_cache.get("bundle") if cached is not None: return cached archived_files = tuple( (f"images/{asset.cache_name}", remote_asset_bytes(asset.cache_name)) for asset in archive_image_assets ) + ( ( "annotations/captions.jsonl", jsonl_bytes( tuple( { "asset": f"images/{asset.cache_name}", "caption": f"SafeBooru fixture mirrored from {asset.source_url}.", "source_url": asset.source_url, "split": "train" if index < 6 else "validation", } for index, asset in enumerate(archive_image_assets) ) ), ), ( "metadata/source-assets.json", json_bytes( { "assets": [ { "path": f"images/{asset.cache_name}", "sha256": asset.sha256, "size": len(remote_asset_bytes(asset.cache_name)), "source_url": asset.source_url, } for asset in archive_image_assets ] } ), ), ) cached = make_indexed_tar_bundle("open-media-archive", archived_files) archive_cache["bundle"] = cached return cached def model_bundle() -> dict[str, bytes]: cached = model_bundle_cache.get("bundle") if cached is not None: return cached shard_specs = ( ( "model-00001-of-00003.safetensors", ( ("language_model.embed_tokens.weight", (7680, 4096)), ("language_model.layers.0.mlp.down_proj.weight", (4096, 2048)), ), ), ( "model-00002-of-00003.safetensors", (("language_model.layers.14.self_attn.q_proj.weight", (8192, 4096)),), ), ( "model-00003-of-00003.safetensors", ( ("language_model.layers.27.mlp.up_proj.weight", (8192, 4096)), ("vision_tower.vision_model.embeddings.class_embedding", (1, 1408)), ), ), ) bundle: dict[str, bytes] = {} total_tensor_bytes = 0 weight_map: dict[str, str] = {} for filename, tensor_specs in shard_specs: payload, tensor_bytes = make_safetensors_bytes( f"vision-language-assistant-3b:{filename}", tensor_specs, ) bundle[filename] = payload total_tensor_bytes += tensor_bytes for tensor_name, _ in tensor_specs: weight_map[tensor_name] = filename bundle["model.safetensors.index.json"] = json_bytes( { "metadata": {"total_size": total_tensor_bytes}, "weight_map": weight_map, } ) model_bundle_cache["bundle"] = bundle return bundle multimodal_files: tuple[SeedFile, ...] = ( ( "README.md", text_bytes( """ --- license: cc-by-4.0 pretty_name: Open Media Multimodal Suite task_categories: - automatic-speech-recognition - image-to-text - video-classification tags: - parquet - indexed-tar - multimodal --- # multimodal-benchmark-suite Local benchmark dataset with real parquet shards, a hfutils.index-compatible tar archive, a larger SafeBooru image set, torchaudio sample WAV files, and an OpenCV sample video for frontend and admin demos. """ ), ), ( "dataset_infos.json", json_bytes( { "default": { "config_name": "default", "features": { "caption": {"dtype": "string", "_type": "Value"}, "duration_seconds": {"dtype": "float32", "_type": "Value"}, "payload": {"dtype": "binary", "_type": "Value"}, "sample_id": {"dtype": "string", "_type": "Value"}, }, "splits": { "train": { "name": "train", "num_examples": 12000, } }, } } ), ), ( "metadata/feature-card.json", json_bytes( { "archive_index": "archives/raw-bundle-0000.json", "archive_tar": "archives/raw-bundle-0000.tar", "media_assets": [path for path, _ in top_level_media_entries], "parquet_train": "parquet/train-00000-of-00001.parquet", } ), ), ( "metadata/source-assets.json", json_bytes( { "assets": [ { "path": path, "sha256": REMOTE_MEDIA_ASSETS[asset_name].sha256, "size": len(remote_asset_bytes(asset_name)), "source_url": REMOTE_MEDIA_ASSETS[asset_name].source_url, } for path, asset_name in top_level_media_entries ] } ), ), seed_file( "parquet/train-00000-of-00001.parquet", lambda: make_parquet_bytes("open-media-train", row_count=12000, payload_size=2048), ), seed_file( "parquet/validation-00000-of-00001.parquet", lambda: make_parquet_bytes("open-media-validation", row_count=1500, payload_size=1024), ), # Real HF-sourced parquet so the pure-client preview (issue #27) # can be exercised against a file that actually came off the # Hugging Face hub, not just locally generated pyarrow output. seed_file( "fixtures/hf-no-robots-test.parquet", lambda: remote_asset_bytes("hf-no-robots-test.parquet"), ), *( seed_file(path, lambda asset_name=asset_name: remote_asset_bytes(asset_name)) for path, asset_name in top_level_media_entries ), seed_file("archives/raw-bundle-0000.tar", lambda: archive_bundle()[0]), seed_file("archives/raw-bundle-0000.json", lambda: archive_bundle()[1]), ) model_files: tuple[SeedFile, ...] = ( ( "README.md", text_bytes( """ --- license: apache-2.0 library_name: transformers pipeline_tag: image-text-to-text tags: - multimodal - sharded-weights - local-dev --- # vision-language-assistant-3b Local multimodal checkpoint with real sharded safetensors weights, tokenizer assets, and processor configs. """ ), ), ( "config.json", json_bytes( { "architectures": ["LlavaForConditionalGeneration"], "hidden_size": 3072, "max_position_embeddings": 8192, "model_type": "llava", "num_hidden_layers": 28, "torch_dtype": "bfloat16", "vocab_size": 128256, } ), ), ( "generation_config.json", json_bytes( { "do_sample": False, "max_new_tokens": 512, "temperature": 0.2, "top_p": 0.9, } ), ), ( "preprocessor_config.json", json_bytes( { "crop_size": 448, "do_center_crop": True, "do_normalize": True, "image_mean": [0.48145466, 0.4578275, 0.40821073], "image_std": [0.26862954, 0.26130258, 0.27577711], } ), ), ( "processor_config.json", json_bytes( { "chat_template": "chat_template.jinja", "image_processor_type": "CLIPImageProcessor", "processor_class": "AutoProcessor", "tokenizer_class": "PreTrainedTokenizerFast", } ), ), ( "special_tokens_map.json", json_bytes( { "bos_token": "", "eos_token": "", "image_token": "", "pad_token": "", } ), ), ( "tokenizer_config.json", json_bytes( { "add_bos_token": True, "chat_template": "{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}", "legacy": False, "model_max_length": 8192, "padding_side": "right", } ), ), ( "tokenizer.json", json_bytes( { "added_tokens": [{"content": "", "id": 128000}], "normalizer": {"type": "NFKC"}, "pre_tokenizer": {"type": "ByteLevel"}, "version": "1.0", } ), ), ( "chat_template.jinja", text_bytes( "{{ bos_token }}{% for message in messages %}{{ message['role'] }}: {{ message['content'] }}{% endfor %}{{ eos_token }}" ), ), ( "README.weights.md", text_bytes( """ # Weight Layout The checkpoint is intentionally sharded into valid safetensors files so local LFS upload, download, and tree views can exercise a few hundred megabytes of realistic model payloads. """ ), ), seed_file( "model.safetensors.index.json", lambda: model_bundle()["model.safetensors.index.json"], ), seed_file( "model-00001-of-00003.safetensors", lambda: model_bundle()["model-00001-of-00003.safetensors"], ), seed_file( "model-00002-of-00003.safetensors", lambda: model_bundle()["model-00002-of-00003.safetensors"], ), seed_file( "model-00003-of-00003.safetensors", lambda: model_bundle()["model-00003-of-00003.safetensors"], ), # Real HF-sourced safetensors (tiny-random-bert, ~520 KB) so the # pure-client preview (issue #27) can be exercised against a file # that actually came off the Hugging Face hub, not just locally # generated safetensors.numpy.save output. seed_file( "fixtures/hf-tiny-random-bert.safetensors", lambda: remote_asset_bytes("hf-tiny-random-bert.safetensors"), ), ) return ( RepoSeed( actor="mai_lin", repo_type="dataset", namespace="open-media-lab", name="multimodal-benchmark-suite", private=False, commits=( CommitSeed( summary="Seed multimodal benchmark suite", description=( "Add a real parquet shard, indexed tar archive, and common media " "formats to exercise local dataset browsing and LFS flows." ), files=multimodal_files, ), CommitSeed( summary="Add archive notes and split manifest", description="Keep the multimodal dataset active with a second commit and metadata refresh.", files=( ( "notes/archive-layout.md", text_bytes( """ # Archive Layout The indexed tar bundle mirrors the hfutils.index layout so local demos can inspect offsets, file sizes, and per-member checksums. """ ), ), ( "metadata/splits.json", json_bytes( { "train": "parquet/train-00000-of-00001.parquet", "validation": "parquet/validation-00000-of-00001.parquet", } ), ), ), ), ), branch="curation-pass", tag="v2026.04-media", download_path="parquet/train-00000-of-00001.parquet", download_sessions=6, ), RepoSeed( actor="mai_lin", repo_type="model", namespace="open-media-lab", name="vision-language-assistant-3b", private=False, commits=( CommitSeed( summary="Publish sharded multimodal assistant checkpoint", description=( "Add common Hugging Face model files and a few hundred megabytes " "of sharded safetensors weights." ), files=model_files, ), CommitSeed( summary="Add eval cards and prompt notes", description="Follow-up commit for model history, metadata, and release-note views.", files=( ( "eval/benchmark.json", json_bytes( { "chart_qa_em": 0.71, "docvqa_anls": 0.63, "latency_ms_p95": 186, } ), ), ( "prompts/system.md", text_bytes( """ # System Prompt Notes - prefer grounded answers over speculative OCR recovery - preserve visible numbers and units - mention image regions when ambiguity remains """ ), ), ), ), ), branch="eval-refresh", tag="v0.9.0-local", download_path="model-00001-of-00003.safetensors", download_sessions=4, ), RepoSeed( actor="mai_lin", repo_type="dataset", namespace="open-media-lab", name="hierarchy-crawl-fixtures", private=False, commits=( CommitSeed( summary="Seed deeply nested tree fixtures", description=( "Generate a repo with many files and several levels of nested paths " "for tree navigation and search coverage." ), files=make_deep_tree_files("hierarchy-crawl"), ), CommitSeed( summary="Add tree smoke-test notes", description="Keep one extra commit so history and diff views remain non-trivial.", files=( ( "notes/path-review.md", text_bytes( """ # Path Review This repo exists to keep large tree browsing reproducible. When a pagination or sorting bug appears, use these fixtures first. """ ), ), ), ), ), branch="path-review", tag="tree-fixtures-2026-04", download_path=( "catalog/section-06/tier-08/branch-06/node-06-08-06/" "entry-06-08-06.json" ), download_sessions=2, ), ) def build_open_media_showcase_repo_seeds() -> tuple[RepoSeed, ...]: specs = ( ("model", "dock-caption-lite", False, "dock captioning smoke-test model"), ("dataset", "quay-ops-snippets", False, "operations dataset for list and preview checks"), ("space", "repo-browser-demo", False, "space used to pin org landing content"), ("model", "layout-distill-small", False, "small layout parser release used for org pages"), ("dataset", "table-scan-fixtures", False, "table extraction fixtures for repeated browsing"), ("space", "taxonomy-review-room", True, "private review board for annotation changes"), ("model", "invoice-embeddings-small", False, "embedding checkpoint metadata fixture"), ("dataset", "ui-search-fixtures", False, "search and pagination samples"), ("space", "annotation-hotfix-board", True, "private space for triage workflows"), ("model", "signal-router-mini", False, "tiny routing model used in showcase cards"), ) repos: list[RepoSeed] = [] for repo_type, name, private, summary in specs: readme = text_bytes( f""" # {name} {summary.capitalize()}. This repository exists to give open-media-lab a realistic repo count in local dev. """ ) if repo_type == "model": files: tuple[SeedFile, ...] = ( ("README.md", readme), ( "config.json", json_bytes( { "hidden_size": 768, "model_type": name, "num_hidden_layers": 12, } ), ), seed_file( f"weights/{name}.safetensors", lambda name=name: make_single_checkpoint_bytes( name, ( ("model.embed_tokens.weight", (2048, 1024)), ("model.layers.0.mlp.up_proj.weight", (1024, 512)), ), ), ), ) download_path = f"weights/{name}.safetensors" elif repo_type == "dataset": files = ( ("README.md", readme), ( "data/rows.jsonl", jsonl_bytes( ( {"id": f"{name}-0001", "label": "alpha"}, {"id": f"{name}-0002", "label": "beta"}, ) ), ), ( "metadata/features.json", json_bytes({"id": "string", "label": "string"}), ), ) download_path = "data/rows.jsonl" else: files = ( ("README.md", readme), ( "app.py", text_bytes( f""" import gradio as gr demo = gr.Interface( fn=lambda text: "{name}: " + text.strip(), inputs=gr.Textbox(label="Input"), outputs=gr.Textbox(label="Output"), title="{name}", ) if __name__ == "__main__": demo.launch() """ ), ), ("requirements.txt", text_bytes("gradio>=4.44.0")), ) download_path = "README.md" repos.append( RepoSeed( actor="mai_lin", repo_type=repo_type, namespace="open-media-lab", name=name, private=private, commits=( CommitSeed( summary=f"Seed {name}", description="Create a compact org repo so the listing page has real density.", files=files, ), ), download_path=download_path, download_sessions=1 if not private else 0, ) ) return tuple(repos) REPO_SEEDS = ( build_repo_seeds() + build_open_media_core_repo_seeds() + build_open_media_showcase_repo_seeds() ) LIKES: tuple[tuple[str, str, str, str], ...] = ( ("leo_park", "model", "mai_lin", "lineart-caption-base"), ("leo_park", "dataset", "mai_lin", "street-sign-zh-en"), ("leo_park", "model", "harbor-vision", "marine-seg-small"), ("sara_chen", "model", "mai_lin", "lineart-caption-base"), ("sara_chen", "model", "aurora-labs", "aurora-ocr-lite"), ("sara_chen", "dataset", "aurora-labs", "receipt-layout-bench"), ("noah_kim", "model", "aurora-labs", "aurora-ocr-lite"), ("noah_kim", "dataset", "mai_lin", "street-sign-zh-en"), ("noah_kim", "space", "leo_park", "formula-checker-lite"), ("ivy_ops", "model", "mai_lin", "lineart-caption-base"), ("ivy_ops", "model", "aurora-labs", "aurora-ocr-lite"), ("ivy_ops", "dataset", "sara_chen", "invoice-entities-mini"), ("mai_lin", "model", "harbor-vision", "marine-seg-small"), ("mai_lin", "space", "leo_park", "formula-checker-lite"), ("mai_lin", "dataset", "aurora-labs", "receipt-layout-bench"), ) # Global fallback sources installed via the admin API so a fresh local seed can # resolve public HuggingFace repos out-of-the-box. Namespace "" = global scope. FALLBACK_SOURCE_SEEDS: tuple[dict, ...] = ( { "namespace": "", "url": "https://huggingface.co", "token": None, "priority": 1000, "name": "HuggingFace", "source_type": "huggingface", "enabled": True, }, ) def account_index() -> dict[str, AccountSeed]: return {account.username: account for account in ACCOUNTS} def repo_slug(repo: RepoSeed) -> str: return f"{repo.repo_type}-{repo.namespace}-{repo.name}".replace("/", "-") def make_avatar_bytes(label: str, background: str, accent: str) -> bytes: image = Image.new("RGB", (512, 512), background) draw = ImageDraw.Draw(image) draw.rounded_rectangle((48, 48, 464, 464), radius=96, outline=accent, width=16) draw.ellipse((120, 120, 392, 392), fill=accent) initials = "".join(part[0].upper() for part in label.replace("-", " ").split()[:2]) font = ImageFont.load_default() text_box = draw.textbbox((0, 0), initials, font=font) text_width = text_box[2] - text_box[0] text_height = text_box[3] - text_box[1] draw.text( ((512 - text_width) / 2, (512 - text_height) / 2), initials, fill=background, font=font, ) buffer = io.BytesIO() image.save(buffer, format="PNG") return buffer.getvalue() def describe_error(response: httpx.Response) -> str: try: payload = response.json() except Exception: payload = response.text return f"HTTP {response.status_code}: {payload}" async def ensure_response( response: httpx.Response, action: str, allowed_statuses: tuple[int, ...] = (200,), ) -> httpx.Response: if response.status_code not in allowed_statuses: raise SeedError(f"{action} failed with {describe_error(response)}") return response def url_to_internal_path(url: str) -> str: parsed = urlsplit(url) path = parsed.path or "/" if parsed.query: path = f"{path}?{parsed.query}" return path def manifest_matches_current_seed() -> bool: if not MANIFEST_PATH.exists(): return False try: payload = json.loads(MANIFEST_PATH.read_text(encoding="utf-8")) except Exception: return False return payload.get("seed_version") == SEED_VERSION def representative_seed_repositories() -> tuple[RepoSeed, ...]: seen_types: set[str] = set() selected: list[RepoSeed] = [] for repo in REPO_SEEDS: if repo.private or repo.repo_type in seen_types: continue seen_types.add(repo.repo_type) selected.append(repo) return tuple(selected) async def detect_seed_state(client: httpx.AsyncClient) -> str: response = await client.get( f"/api/users/{PRIMARY_USERNAME}/type", params={"fallback": "false"}, ) if response.status_code == 404: return "missing" await ensure_response(response, f"check existing seed for {PRIMARY_USERNAME}") if not manifest_matches_current_seed(): return "incomplete" for repo in representative_seed_repositories(): info_response = await client.get(f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}") if info_response.status_code == 404: return "incomplete" await ensure_response( info_response, f"verify seeded repo metadata for {repo.namespace}/{repo.name}", ) tree_response = await client.get( f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/tree/main" ) if tree_response.status_code == 404: return "incomplete" await ensure_response( tree_response, f"verify seeded repo storage for {repo.namespace}/{repo.name}", ) return "ready" async def register_account(client: httpx.AsyncClient, account: AccountSeed) -> None: response = await client.post( "/api/auth/register", json={ "username": account.username, "email": account.email, "password": DEFAULT_PASSWORD, }, ) if response.status_code == 200: return if response.status_code == 400: message = str(response.json()) if "exists" in message or "conflicts" in message: return raise SeedError(f"register {account.username} failed with {describe_error(response)}") async def login_account(client: httpx.AsyncClient, account: AccountSeed) -> None: response = await client.post( "/api/auth/login", json={"username": account.username, "password": DEFAULT_PASSWORD}, ) await ensure_response(response, f"login {account.username}") if "session_id" not in client.cookies: raise SeedError(f"login {account.username} did not set a session cookie") async def upload_avatar( client: httpx.AsyncClient, path: str, label: str, background: str, accent: str, ) -> None: response = await client.post( path, files={ "file": ( f"{label}.png", make_avatar_bytes(label, background, accent), "image/png", ) }, ) await ensure_response(response, f"upload avatar for {label}") async def configure_user_profile(client: httpx.AsyncClient, account: AccountSeed) -> None: response = await client.put( f"/api/users/{account.username}/settings", json={ "email": account.email, "full_name": account.full_name, "bio": account.bio, "website": account.website, "social_media": account.social_media, }, ) await ensure_response(response, f"update user settings for {account.username}") await upload_avatar( client, f"/api/users/{account.username}/avatar", account.username, account.avatar_bg, account.avatar_accent, ) def admin_headers() -> dict[str, str]: return {"X-Admin-Token": cfg.admin.secret_token} async def ensure_fallback_source( client: httpx.AsyncClient, source: dict ) -> None: list_response = await client.get( "/admin/api/fallback-sources", params={"namespace": source["namespace"]}, headers=admin_headers(), ) await ensure_response( list_response, f"list fallback sources for namespace={source['namespace']!r}", ) normalized_url = source["url"].rstrip("/") for existing in list_response.json(): if existing["url"].rstrip("/") == normalized_url: return create_response = await client.post( "/admin/api/fallback-sources", json=source, headers=admin_headers(), ) await ensure_response( create_response, f"create fallback source {source['name']} ({normalized_url})", ) async def create_organization( client: httpx.AsyncClient, organization: OrganizationSeed ) -> None: response = await client.post( "/org/create", json={ "name": organization.name, "description": organization.description, }, ) if response.status_code == 200: return if response.status_code == 400 and "already exists" in str(response.json()): return raise SeedError( f"create organization {organization.name} failed with {describe_error(response)}" ) async def ensure_org_member( client: httpx.AsyncClient, org_name: str, username: str, role: str, ) -> None: response = await client.post( f"/org/{org_name}/members", json={"username": username, "role": role}, ) if response.status_code not in (200, 400): raise SeedError( f"add {username} to {org_name} failed with {describe_error(response)}" ) # PUT keeps roles deterministic even if the member already existed. response = await client.put( f"/org/{org_name}/members/{username}", json={"role": role}, ) await ensure_response(response, f"set role for {username} in {org_name}") async def configure_organization( client: httpx.AsyncClient, organization: OrganizationSeed ) -> None: response = await client.put( f"/api/organizations/{organization.name}/settings", json={ "description": organization.description, "bio": organization.bio, "website": organization.website, "social_media": organization.social_media, }, ) await ensure_response(response, f"update organization settings for {organization.name}") await upload_avatar( client, f"/api/organizations/{organization.name}/avatar", organization.name, organization.avatar_bg, organization.avatar_accent, ) async def create_repo(client: httpx.AsyncClient, repo: RepoSeed) -> None: payload = { "type": repo.repo_type, "name": repo.name, "private": repo.private, } if repo.namespace != repo.actor: payload["organization"] = repo.namespace response = await client.post("/api/repos/create", json=payload) if response.status_code == 200: return if response.status_code == 400 and "already exists" in str(response.json()): return raise SeedError(f"create repo {repo.namespace}/{repo.name} failed with {describe_error(response)}") async def upload_lfs_object( client: httpx.AsyncClient, repo: RepoSeed, content: bytes, ) -> tuple[str, int]: oid = hashlib.sha256(content).hexdigest() size = len(content) response = await client.post( f"/{repo.repo_type}s/{repo.namespace}/{repo.name}.git/info/lfs/objects/batch", json={ "operation": "upload", "transfers": ["basic"], "objects": [{"oid": oid, "size": size}], "hash_algo": "sha256", # Local dev uses the frontend base_url publicly, so the seed script rewrites # verify URLs back onto the in-process backend transport. "is_browser": True, }, ) await ensure_response(response, f"prepare LFS upload for {repo.namespace}/{repo.name}") batch_data = response.json() obj = batch_data["objects"][0] if obj.get("error"): raise SeedError(f"LFS batch returned an error for {repo.namespace}/{repo.name}: {obj['error']}") upload_action = (obj.get("actions") or {}).get("upload") if upload_action: upload_headers = upload_action.get("header") or {} async with httpx.AsyncClient(follow_redirects=False, timeout=60.0) as network_client: upload_response = await network_client.put( upload_action["href"], content=content, headers=upload_headers, ) if upload_response.status_code not in (200, 201): raise SeedError( f"LFS upload failed for {repo.namespace}/{repo.name}: " f"HTTP {upload_response.status_code} {upload_response.text}" ) verify_action = (obj.get("actions") or {}).get("verify") if verify_action: verify_response = await client.post( url_to_internal_path(verify_action["href"]), json={"oid": oid, "size": size}, ) await ensure_response( verify_response, f"verify LFS upload for {repo.namespace}/{repo.name}", ) return oid, size async def commit_files( client: httpx.AsyncClient, repo: RepoSeed, commit: CommitSeed, ) -> None: materialized_files = [materialize_seed_file(file_entry) for file_entry in commit.files] metadata = [] for path, content in materialized_files: sha256 = hashlib.sha256(content).hexdigest() metadata.append( { "path": path, "size": len(content), "sha256": sha256, } ) preupload_response = await client.post( f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/preupload/main", json={"files": metadata}, ) await ensure_response( preupload_response, f"preupload {repo.namespace}/{repo.name}", ) preupload_results = { item["path"]: item for item in preupload_response.json().get("files", []) } ndjson_lines = [ { "key": "header", "value": { "summary": commit.summary, "description": commit.description, }, } ] for path, content in materialized_files: mode = preupload_results[path]["uploadMode"] if preupload_results[path]["shouldIgnore"]: continue if mode == "lfs": oid, size = await upload_lfs_object(client, repo, content) ndjson_lines.append( { "key": "lfsFile", "value": { "path": path, "oid": oid, "size": size, "algo": "sha256", }, } ) continue ndjson_lines.append( { "key": "file", "value": { "path": path, "content": base64.b64encode(content).decode("ascii"), "encoding": "base64", }, } ) ndjson_payload = "\n".join(json.dumps(line, sort_keys=True) for line in ndjson_lines) response = await client.post( f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/commit/main", content=ndjson_payload, headers={"Content-Type": "application/x-ndjson"}, ) await ensure_response(response, f"commit {repo.namespace}/{repo.name}") async def create_branch(client: httpx.AsyncClient, repo: RepoSeed) -> None: if not repo.branch: return response = await client.post( f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/branch", json={"branch": repo.branch, "revision": "main"}, ) if response.status_code == 200: return if response.status_code in (400, 409) and "already exists" in str(response.json()): return raise SeedError( f"create branch {repo.branch} for {repo.namespace}/{repo.name} failed with " f"{describe_error(response)}" ) async def create_tag(client: httpx.AsyncClient, repo: RepoSeed) -> None: if not repo.tag: return response = await client.post( f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/tag", json={"tag": repo.tag, "revision": "main"}, ) if response.status_code == 200: return if response.status_code in (400, 409) and "already exists" in str(response.json()): return raise SeedError( f"create tag {repo.tag} for {repo.namespace}/{repo.name} failed with " f"{describe_error(response)}" ) async def like_repo( client: httpx.AsyncClient, repo_type: str, namespace: str, name: str, ) -> None: response = await client.post(f"/api/{repo_type}s/{namespace}/{name}/like") if response.status_code == 200: return if response.status_code == 400 and "already liked" in str(response.json()): return raise SeedError( f"like {repo_type}/{namespace}/{name} failed with {describe_error(response)}" ) async def trigger_download( client: httpx.AsyncClient, repo: RepoSeed, path: str, *, cookies: dict[str, str] | None = None, ) -> None: response = await client.get( f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/resolve/main/{path}", cookies=cookies, ) if response.status_code not in (302, 307): raise SeedError( f"download seed for {repo.namespace}/{repo.name}:{path} failed with " f"{describe_error(response)}" ) def build_manifest() -> dict: return { "seed_version": SEED_VERSION, "manifest_path": str(MANIFEST_PATH), "main_ui_url": cfg.app.base_url, "backend_url": INTERNAL_BASE_URL, "main_login": { "username": PRIMARY_USERNAME, "password": DEFAULT_PASSWORD, }, "additional_users": [ { "username": account.username, "password": DEFAULT_PASSWORD, "email": account.email, } for account in ACCOUNTS if account.username != PRIMARY_USERNAME ], "admin_ui": { "url": "http://127.0.0.1:5174", "token": cfg.admin.secret_token, }, "organizations": [ { "name": organization.name, "members": [ {"username": username, "role": role} for username, role in organization.members ], } for organization in ORGANIZATIONS ], "repositories": [ { "type": repo.repo_type, "namespace": repo.namespace, "name": repo.name, "private": repo.private, } for repo in REPO_SEEDS ], "fallback_sources": [ { "namespace": source["namespace"], "url": source["url"].rstrip("/"), "name": source["name"], "source_type": source["source_type"], "priority": source["priority"], } for source in FALLBACK_SOURCE_SEEDS ], } def write_manifest() -> None: MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True) MANIFEST_PATH.write_text( json.dumps(build_manifest(), indent=2, sort_keys=True) + "\n", encoding="utf-8", ) def print_summary(seed_applied: bool) -> None: state = "Seeded" if seed_applied else "Seed already present" print(f"{state}: {SEED_VERSION}") print(f"Manifest: {MANIFEST_PATH}") print(f"Main UI: {cfg.app.base_url}") print(f"Backend: {INTERNAL_BASE_URL}") print(f"Login: {PRIMARY_USERNAME} / {DEFAULT_PASSWORD}") print(f"Admin UI token: {cfg.admin.secret_token}") async def seed_demo_data() -> None: init_storage() transport = httpx.ASGITransport(app=app) accounts_by_name = account_index() async with AsyncExitStack() as stack: seed_client = await stack.enter_async_context( httpx.AsyncClient( transport=transport, base_url=INTERNAL_BASE_URL, follow_redirects=False, ) ) seed_state = await detect_seed_state(seed_client) if seed_state == "ready": write_manifest() print_summary(seed_applied=False) return if seed_state == "incomplete": raise SeedError( "Local demo seed is only partially present. " "Run `make reset-local-data` and then retry `make seed-demo`." ) for account in ACCOUNTS: await register_account(seed_client, account) for fallback_source in FALLBACK_SOURCE_SEEDS: await ensure_fallback_source(seed_client, fallback_source) authed_clients: dict[str, httpx.AsyncClient] = {} for account in ACCOUNTS: client = await stack.enter_async_context( httpx.AsyncClient( transport=transport, base_url=INTERNAL_BASE_URL, follow_redirects=False, ) ) await login_account(client, account) await configure_user_profile(client, account) authed_clients[account.username] = client primary_client = authed_clients[PRIMARY_USERNAME] for organization in ORGANIZATIONS: await create_organization(primary_client, organization) for username, role in organization.members: if username == PRIMARY_USERNAME: continue await ensure_org_member(primary_client, organization.name, username, role) await configure_organization(primary_client, organization) for repo in REPO_SEEDS: repo_client = authed_clients[repo.actor] await create_repo(repo_client, repo) for commit in repo.commits: await commit_files(repo_client, repo, commit) await create_branch(repo_client, repo) await create_tag(repo_client, repo) for liker, repo_type, namespace, name in LIKES: await like_repo(authed_clients[liker], repo_type, namespace, name) anon_client = await stack.enter_async_context( httpx.AsyncClient( transport=transport, base_url=INTERNAL_BASE_URL, follow_redirects=False, ) ) for repo in REPO_SEEDS: if not repo.download_path: continue if repo.private: await trigger_download( authed_clients[PRIMARY_USERNAME], repo, repo.download_path, ) continue for session_number in range(repo.download_sessions): await trigger_download( anon_client, repo, repo.download_path, cookies={ "hf_download_session": f"seed-{repo_slug(repo)}-{session_number:02d}" }, ) # Download tracking happens in background tasks off the API response path. await asyncio.sleep(0.5) write_manifest() print_summary(seed_applied=True) def main() -> int: try: asyncio.run(seed_demo_data()) except SeedError as exc: print(f"Seed failed: {exc}", file=sys.stderr) return 1 return 0 if __name__ == "__main__": raise SystemExit(main())