mirror of
https://github.com/KohakuBlueleaf/KohakuHub.git
synced 2026-05-08 04:47:53 -05:00
1989 lines
71 KiB
Python
Executable File
1989 lines
71 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Create deterministic local demo data through KohakuHub's API surface."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import base64
|
|
import hashlib
|
|
import io
|
|
import json
|
|
import sys
|
|
import textwrap
|
|
from contextlib import AsyncExitStack
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Iterable
|
|
from urllib.parse import urlsplit
|
|
|
|
import httpx
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
# Repository root, taken as the third ancestor of this script's resolved path.
ROOT_DIR = Path(__file__).resolve().parents[2]
SRC_DIR = ROOT_DIR / "src"

# Put the in-tree ``src`` directory first on the import path (once) so the
# ``kohakuhub`` imports that follow resolve when this file runs as a script.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))
|
|
|
|
from kohakuhub.config import cfg
|
|
from kohakuhub.main import app
|
|
from kohakuhub.utils.s3 import init_storage
|
|
|
|
# Identifier for this revision of the demo data set.
SEED_VERSION = "local-dev-demo-v1"

# Shared password for every seeded account (local development fixtures only).
DEFAULT_PASSWORD = "KohakuDev123!"

# Account whose presence is used to detect an already-seeded instance.
PRIMARY_USERNAME = "mai_lin"

MANIFEST_PATH = ROOT_DIR.joinpath("hub-meta", "dev", "demo-seed-manifest.json")

# Prefer an explicit internal URL, then the configured public base URL, and
# finally a loopback default.
INTERNAL_BASE_URL = (
    getattr(cfg.app, "internal_base_url", None)
    or cfg.app.base_url
    or "http://127.0.0.1:48888"
)
|
|
|
|
|
|
class SeedError(RuntimeError):
    """Signal that a step of the demo-data seeding run did not succeed."""
|
|
|
|
|
|
@dataclass(frozen=True)
class AccountSeed:
    """Immutable description of one demo user account."""

    # Login name; also the key used by ``account_index``.
    username: str
    email: str
    # Human-readable display name.
    full_name: str
    bio: str
    website: str
    # Platform key (e.g. "github") -> handle on that platform.
    social_media: dict[str, str]
    # Colours for the account avatar; the seeds in this file use "#rrggbb".
    avatar_bg: str
    avatar_accent: str
|
|
|
|
|
|
@dataclass(frozen=True)
class OrganizationSeed:
    """Immutable description of one demo organization and its membership."""

    # Organization name; seeded repos use it as their namespace.
    name: str
    description: str
    bio: str
    website: str
    # Platform key (e.g. "github") -> handle on that platform.
    social_media: dict[str, str]
    # Colours for the organization avatar; the seeds here use "#rrggbb".
    avatar_bg: str
    avatar_accent: str
    # (username, role) pairs, e.g. ("mai_lin", "super-admin").
    members: tuple[tuple[str, str], ...]
|
|
|
|
|
|
@dataclass(frozen=True)
class CommitSeed:
    """Immutable description of a single commit to create in a demo repo."""

    # Short commit title.
    summary: str
    # Longer commit message body.
    description: str
    # (path inside the repo, file content) pairs carried by this commit.
    files: tuple[tuple[str, bytes], ...]
|
|
|
|
|
|
@dataclass(frozen=True)
class RepoSeed:
    """Immutable description of one demo repository and its history."""

    # Username of the account that creates and commits to the repo.
    actor: str
    # One of the hub repo kinds; the seeds here use "model", "dataset", "space".
    repo_type: str
    # Owning user or organization name.
    namespace: str
    name: str
    private: bool
    # Commits to apply, in order.
    commits: tuple[CommitSeed, ...]
    # Optional extra branch / tag names (consumer not visible in this part of
    # the file -- presumably created after the commits; confirm at call site).
    branch: str | None = None
    tag: str | None = None
    # Optional file path and session count, presumably used to simulate
    # download activity for the repo -- confirm at call site.
    download_path: str | None = None
    download_sessions: int = 0
|
|
|
|
|
|
# Demo user accounts, in creation order. The first entry matches
# PRIMARY_USERNAME.
ACCOUNTS: tuple[AccountSeed, ...] = (
    # Primary demo user.
    AccountSeed(
        username="mai_lin",
        email="mai.lin@kohakuhub.dev",
        full_name="Mai Lin",
        bio=(
            "Product-minded ML engineer focused on reproducible dataset QA, "
            "small-model packaging, and local debugging workflows."
        ),
        website="https://kohakuhub.local/mai-lin",
        social_media={
            "github": "mai-lin-labs",
            "huggingface": "mai-lin-labs",
            "twitter_x": "mai_lin_ops",
        },
        avatar_bg="#183153",
        avatar_accent="#f59e0b",
    ),
    AccountSeed(
        username="leo_park",
        email="leo.park@kohakuhub.dev",
        full_name="Leo Park",
        bio=(
            "Frontend-heavy engineer who keeps repo demos honest with browser "
            "smoke tests and hand-curated example data."
        ),
        website="https://kohakuhub.local/leo-park",
        social_media={
            "github": "leo-park-dev",
            "threads": "leo.park.dev",
        },
        avatar_bg="#0f766e",
        avatar_accent="#f8fafc",
    ),
    AccountSeed(
        username="sara_chen",
        email="sara.chen@kohakuhub.dev",
        full_name="Sara Chen",
        bio=(
            "Annotation lead for invoice, receipt, and layout-heavy datasets. "
            "Prefers clean schemas over magical post-processing."
        ),
        website="https://kohakuhub.local/sara-chen",
        social_media={
            "github": "sara-chen-data",
            "huggingface": "sara-chen-data",
        },
        avatar_bg="#7c2d12",
        avatar_accent="#fde68a",
    ),
    AccountSeed(
        username="noah_kim",
        email="noah.kim@kohakuhub.dev",
        full_name="Noah Kim",
        bio=(
            "Ships compact vision models for harbor monitoring, segmentation, "
            "and camera-side smoke testing."
        ),
        website="https://kohakuhub.local/noah-kim",
        social_media={
            "github": "noah-kim-vision",
            "twitter_x": "noahkimvision",
        },
        avatar_bg="#1d4ed8",
        avatar_accent="#dbeafe",
    ),
    AccountSeed(
        username="ivy_ops",
        email="ivy.ops@kohakuhub.dev",
        full_name="Ivy Ops",
        bio=(
            "Release and infra support. Uses stable, boring fixtures so bug "
            "reports stay reproducible."
        ),
        website="https://kohakuhub.local/ivy-ops",
        social_media={
            "github": "ivy-ops",
        },
        avatar_bg="#3f3f46",
        avatar_accent="#f4f4f5",
    ),
)
|
|
|
|
# Demo organizations. Every member username refers to an entry in ACCOUNTS.
ORGANIZATIONS: tuple[OrganizationSeed, ...] = (
    OrganizationSeed(
        name="aurora-labs",
        description=(
            "Applied document intelligence team building OCR-friendly models, "
            "datasets, and lightweight internal tooling."
        ),
        bio=(
            "Aurora Labs curates multilingual OCR assets for receipts, forms, "
            "and customer-service automation."
        ),
        website="https://aurora-labs.kohakuhub.local",
        social_media={
            "github": "aurora-labs",
            "huggingface": "aurora-labs",
        },
        avatar_bg="#312e81",
        avatar_accent="#e0e7ff",
        # One member per role so each permission level has a fixture.
        members=(
            ("mai_lin", "super-admin"),
            ("leo_park", "admin"),
            ("sara_chen", "member"),
            ("ivy_ops", "visitor"),
        ),
    ),
    OrganizationSeed(
        name="harbor-vision",
        description=(
            "Small computer-vision team for coastal monitoring, dock safety, "
            "and camera-ready deployment checks."
        ),
        bio=(
            "Harbor Vision maintains compact segmentation and inspection models "
            "for edge-friendly marine operations."
        ),
        website="https://harbor-vision.kohakuhub.local",
        social_media={
            "github": "harbor-vision",
            "twitter_x": "harborvision",
        },
        avatar_bg="#0f766e",
        avatar_accent="#ccfbf1",
        members=(
            ("mai_lin", "super-admin"),
            ("noah_kim", "super-admin"),
            ("leo_park", "visitor"),
        ),
    ),
)
|
|
|
|
|
|
def text_bytes(body: str) -> bytes:
    """Return *body* as UTF-8 file content.

    Common leading indentation is removed, surrounding whitespace is trimmed,
    and exactly one trailing newline is appended, so indented triple-quoted
    literals can be used directly as file bodies.
    """
    normalized = textwrap.dedent(body).strip()
    return f"{normalized}\n".encode("utf-8")
|
|
|
|
|
|
def json_bytes(payload: dict | list) -> bytes:
|
|
return (json.dumps(payload, indent=2, sort_keys=True) + "\n").encode("utf-8")
|
|
|
|
|
|
def csv_bytes(rows: Iterable[Iterable[str]]) -> bytes:
    """Serialize *rows* as UTF-8 CSV file content with ``\\n`` line endings.

    Fields are written through :mod:`csv`, so a value containing a comma, a
    double quote, or a line break is quoted/escaped instead of silently
    producing a malformed row. Plain fields are emitted unchanged.

    Args:
        rows: Iterable of rows, each an iterable of field values.

    Returns:
        The encoded CSV text. An empty *rows* yields a single newline, which
        matches the output of the previous join-based implementation.
    """
    import csv  # Local import: only this helper needs the module.

    buffer = io.StringIO(newline="")
    csv.writer(buffer, lineterminator="\n").writerows(rows)
    # ``or "\n"`` keeps the historical output for an empty row iterable.
    return (buffer.getvalue() or "\n").encode("utf-8")
|
|
|
|
|
|
def jsonl_bytes(rows: Iterable[dict]) -> bytes:
    """Serialize *rows* as JSON Lines file content.

    Each row becomes one compact JSON object with sorted keys; rows are
    separated by newlines and the result ends with a newline.
    """
    encoded_rows = [json.dumps(row, sort_keys=True) for row in rows]
    document = "\n".join(encoded_rows) + "\n"
    return document.encode("utf-8")
|
|
|
|
|
|
def profile_space_files(title: str, summary: str, accent: str) -> tuple[tuple[str, bytes], ...]:
    """Build the file set for a small Gradio "profile showcase" space.

    Args:
        title: Space title; used in the README front matter and heading and
            in the generated ``app.py``.
        summary: One-line description placed in the README body and used as
            the Gradio interface description.
        accent: Value passed as ``primary_hue`` to the Gradio theme.

    Returns:
        ``(path, content)`` pairs for ``README.md``, ``app.py`` and
        ``requirements.txt``.
    """
    # The values below end up inside generated Python source. Emit them as
    # escaped, double-quoted literals so a quote or backslash in the input
    # cannot break ``app.py``. ``json.dumps(..., ensure_ascii=False)`` output
    # is also a valid Python string literal, and for plain text it is exactly
    # the "..."-wrapped value that was interpolated before.
    greeting_literal = json.dumps(f"{title}: ", ensure_ascii=False)
    title_literal = json.dumps(title, ensure_ascii=False)
    summary_literal = json.dumps(summary, ensure_ascii=False)
    accent_literal = json.dumps(accent, ensure_ascii=False)

    # NOTE(review): ``title`` is written unquoted into the YAML front matter;
    # a title containing ": " would need quoting there. Left as-is so the
    # seeded README bytes do not change.
    readme = text_bytes(
        f"""
        ---
        title: {title}
        emoji: "\u2605"
        colorFrom: indigo
        colorTo: amber
        sdk: gradio
        sdk_version: "4.44.0"
        ---

        # {title}

        {summary}

        This space exists so local profile pages render with realistic content
        instead of an empty placeholder repository.
        """
    )

    app_source = text_bytes(
        f"""
        import gradio as gr

        demo = gr.Interface(
            fn=lambda text: {greeting_literal} + text.strip(),
            inputs=gr.Textbox(label="Prompt"),
            outputs=gr.Textbox(label="Response"),
            title={title_literal},
            description={summary_literal},
            theme=gr.themes.Soft(primary_hue={accent_literal}),
        )

        if __name__ == "__main__":
            demo.launch()
        """
    )

    return (
        ("README.md", readme),
        ("app.py", app_source),
        ("requirements.txt", text_bytes("gradio>=4.44.0")),
    )
|
|
|
|
|
|
def lfs_blob(label: str) -> bytes:
    """Return a deterministic fake binary payload tagged with *label*.

    The result is a ``SEED-LFS::<label>`` header line followed by 1024 bytes
    of a repeating hex-digit pattern.
    """
    filler = b"0123456789abcdef" * 64
    return b"".join((f"SEED-LFS::{label}\n".encode("utf-8"), filler))
|
|
|
|
|
|
# Build the ordered tuple of demo repositories. Each RepoSeed names the acting
# account, the repo kind ("model", "dataset" or "space"), its namespace (a user
# or an organization), its visibility, and the commits to apply in order; some
# also carry a branch, a tag, and download settings. The function takes no
# arguments and returns the same literal data on every call; it is evaluated
# once at import time to populate REPO_SEEDS.
def build_repo_seeds() -> tuple[RepoSeed, ...]:
|
|
return (
|
|
RepoSeed(
|
|
actor="mai_lin",
|
|
repo_type="model",
|
|
namespace="mai_lin",
|
|
name="lineart-caption-base",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Bootstrap base caption model",
|
|
description=(
|
|
"Create the public demo model repo with a realistic README, "
|
|
"lightweight config, and a small LFS-tracked checkpoint."
|
|
),
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: mit
|
|
library_name: transformers
|
|
pipeline_tag: image-to-text
|
|
tags:
|
|
- captioning
|
|
- line-art
|
|
- document-vision
|
|
---
|
|
|
|
# lineart-caption-base
|
|
|
|
A compact caption model tuned for monochrome line art,
|
|
icon-heavy diagrams, and OCR-adjacent illustrations.
|
|
|
|
## Intended use
|
|
|
|
- draft captions for internal QA dashboards
|
|
- generate quick prompts for reviewers
|
|
- validate frontend metadata rendering
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"config.json",
|
|
json_bytes(
|
|
{
|
|
"architectures": ["VisionEncoderDecoderModel"],
|
|
"decoder_layers": 6,
|
|
"encoder_layers": 12,
|
|
"image_size": 448,
|
|
"model_type": "lineart-caption-base",
|
|
"vocab_size": 32000,
|
|
}
|
|
),
|
|
),
|
|
(
|
|
"tokenizer.json",
|
|
json_bytes(
|
|
{
|
|
"added_tokens": [],
|
|
"normalizer": {"type": "NFKC"},
|
|
"pre_tokenizer": {"type": "Whitespace"},
|
|
"version": "1.0",
|
|
}
|
|
),
|
|
),
|
|
("examples/prompt.txt", text_bytes("Describe the icon, layout, and visible text.")),
|
|
(
|
|
"checkpoints/lineart-caption-base.safetensors",
|
|
lfs_blob("lineart-caption-base"),
|
|
),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add eval notes and release metrics",
|
|
description="Follow-up commit so commit history and file updates are visible in local UI.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: mit
|
|
library_name: transformers
|
|
pipeline_tag: image-to-text
|
|
tags:
|
|
- captioning
|
|
- line-art
|
|
- document-vision
|
|
---
|
|
|
|
# lineart-caption-base
|
|
|
|
A compact caption model tuned for monochrome line art,
|
|
icon-heavy diagrams, and OCR-adjacent illustrations.
|
|
|
|
## Current release
|
|
|
|
- validation CIDEr: 1.38
|
|
- latency target: <120 ms on local A10G
|
|
- known gap: dense legends still need manual review
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"eval/metrics.json",
|
|
json_bytes(
|
|
{
|
|
"cider": 1.38,
|
|
"clip_score": 0.284,
|
|
"latency_ms_p50": 87,
|
|
"latency_ms_p95": 114,
|
|
}
|
|
),
|
|
),
|
|
(
|
|
"docs/training-notes.md",
|
|
text_bytes(
|
|
"""
|
|
# Training Notes
|
|
|
|
- Base corpus: 82k internal line-art render pairs
|
|
- Additional hard negatives: 4k cluttered signage crops
|
|
- Checkpoint exported for small-batch browser smoke tests
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
branch="ablation-notes",
|
|
tag="v0.2.1",
|
|
download_path="checkpoints/lineart-caption-base.safetensors",
|
|
download_sessions=4,
|
|
),
|
|
RepoSeed(
|
|
actor="mai_lin",
|
|
repo_type="dataset",
|
|
namespace="mai_lin",
|
|
name="street-sign-zh-en",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Import bilingual street sign dataset",
|
|
description="Seed a CSV-backed dataset that exercises dataset preview and tree views.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: cc-by-4.0
|
|
task_categories:
|
|
- image-text-to-text
|
|
language:
|
|
- zh
|
|
- en
|
|
pretty_name: Street Sign ZH EN
|
|
---
|
|
|
|
# street-sign-zh-en
|
|
|
|
A small bilingual dataset for OCR-friendly sign translation and
|
|
layout QA. Rows keep the original text, translation, and scene tag.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"data/train.csv",
|
|
csv_bytes(
|
|
(
|
|
("image", "text_zh", "text_en", "scene"),
|
|
("img_0001.png", "\u5317\u4eac\u7ad9", "Beijing Railway Station", "station"),
|
|
("img_0002.png", "\u5c0f\u5fc3\u53f0\u9636", "Watch Your Step", "retail"),
|
|
("img_0003.png", "\u7981\u6b62\u5438\u70df", "No Smoking", "hospital"),
|
|
("img_0004.png", "\u53f3\u8f6c\u8f66\u9053", "Right Turn Only", "road"),
|
|
)
|
|
),
|
|
),
|
|
(
|
|
"data/validation.csv",
|
|
csv_bytes(
|
|
(
|
|
("image", "text_zh", "text_en", "scene"),
|
|
("val_0001.png", "\u51fa\u53e3", "Exit", "mall"),
|
|
("val_0002.png", "\u670d\u52a1\u53f0", "Service Desk", "airport"),
|
|
)
|
|
),
|
|
),
|
|
(
|
|
"metadata/features.json",
|
|
json_bytes(
|
|
{
|
|
"image": "string",
|
|
"text_zh": "string",
|
|
"text_en": "string",
|
|
"scene": "string",
|
|
}
|
|
),
|
|
),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add preview samples for dataset viewer",
|
|
description="Include JSONL samples and notebook notes for local bug reproduction.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: cc-by-4.0
|
|
task_categories:
|
|
- image-text-to-text
|
|
language:
|
|
- zh
|
|
- en
|
|
pretty_name: Street Sign ZH EN
|
|
---
|
|
|
|
# street-sign-zh-en
|
|
|
|
A small bilingual dataset for OCR-friendly sign translation and
|
|
layout QA. Rows keep the original text, translation, and scene tag.
|
|
|
|
## Notes
|
|
|
|
Validation rows intentionally mix transport, retail, and public
|
|
service scenarios so sorting and filtering bugs are easier to spot.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"previews/samples.jsonl",
|
|
jsonl_bytes(
|
|
(
|
|
{
|
|
"image": "img_0001.png",
|
|
"text_zh": "\u5317\u4eac\u7ad9",
|
|
"text_en": "Beijing Railway Station",
|
|
"scene": "station",
|
|
},
|
|
{
|
|
"image": "img_0002.png",
|
|
"text_zh": "\u5c0f\u5fc3\u53f0\u9636",
|
|
"text_en": "Watch Your Step",
|
|
"scene": "retail",
|
|
},
|
|
)
|
|
),
|
|
),
|
|
(
|
|
"notebooks/README.md",
|
|
text_bytes(
|
|
"""
|
|
# Notebook Notes
|
|
|
|
This dataset is intentionally tiny in local dev. The point is to
|
|
exercise preview, pagination, and schema rendering without waiting
|
|
on a large bootstrap import.
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
branch="qa-pass",
|
|
tag="2026-04-demo",
|
|
download_path="data/train.csv",
|
|
download_sessions=8,
|
|
),
|
|
RepoSeed(
|
|
actor="mai_lin",
|
|
repo_type="space",
|
|
namespace="mai_lin",
|
|
name="mai_lin",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Create profile showcase space",
|
|
description="Provide a same-name space so local profile pages render a realistic card.",
|
|
files=profile_space_files(
|
|
"Mai Lin Workspace",
|
|
"Small utilities and pinned demos used for local reproduction.",
|
|
"amber",
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add profile theme preset",
|
|
description="A second commit makes the space history non-empty for UI testing.",
|
|
files=(
|
|
(
|
|
"assets/theme.json",
|
|
json_bytes(
|
|
{
|
|
"accent": "amber",
|
|
"layout": "split",
|
|
"panels": ["repos", "activity", "notes"],
|
|
}
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
RepoSeed(
|
|
actor="mai_lin",
|
|
repo_type="dataset",
|
|
namespace="mai_lin",
|
|
name="internal-evals",
|
|
private=True,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Seed private eval artifacts",
|
|
description="Keep one private user-owned repo for auth and permission checks.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
# internal-evals
|
|
|
|
Private staging area for eval summaries and failure-case review.
|
|
This repo is intentionally private and only accessible to Mai.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"runs/2026-04-15-summary.json",
|
|
json_bytes(
|
|
{
|
|
"caption_regressions": 7,
|
|
"dataset": "street-sign-zh-en",
|
|
"notes": "False positives cluster around mirrored storefront text.",
|
|
}
|
|
),
|
|
),
|
|
(
|
|
"data/failure_cases.jsonl",
|
|
jsonl_bytes(
|
|
(
|
|
{
|
|
"file": "eval_001.png",
|
|
"issue": "mirror_text",
|
|
"severity": "medium",
|
|
},
|
|
{
|
|
"file": "eval_002.png",
|
|
"issue": "crowded_legend",
|
|
"severity": "high",
|
|
},
|
|
)
|
|
),
|
|
),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add reviewer checklist",
|
|
description="Second commit for commit-history coverage on a private repo.",
|
|
files=(
|
|
(
|
|
"notes/reviewer-checklist.md",
|
|
text_bytes(
|
|
"""
|
|
# Reviewer Checklist
|
|
|
|
- confirm sample renders in dataset viewer
|
|
- compare translated text against bilingual CSV rows
|
|
- log UI regressions with the seeded repo name
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
download_path="runs/2026-04-15-summary.json",
|
|
download_sessions=1,
|
|
),
|
|
RepoSeed(
|
|
actor="mai_lin",
|
|
repo_type="space",
|
|
namespace="aurora-labs",
|
|
name="aurora-labs",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Create org showcase space",
|
|
description="Same-name org space keeps organization profile pages representative.",
|
|
files=profile_space_files(
|
|
"Aurora Labs Demo Portal",
|
|
"Landing page for OCR demos, pinned datasets, and release notes.",
|
|
"indigo",
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add roadmap note",
|
|
description="A lightweight follow-up commit for org space history.",
|
|
files=(
|
|
(
|
|
"docs/roadmap.md",
|
|
text_bytes(
|
|
"""
|
|
# Local Demo Roadmap
|
|
|
|
- tighten OCR-lite benchmark reporting
|
|
- keep receipt-layout-bench labels stable for bug repro
|
|
- mirror one private support model for permission testing
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
RepoSeed(
|
|
actor="mai_lin",
|
|
repo_type="model",
|
|
namespace="aurora-labs",
|
|
name="aurora-ocr-lite",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Publish OCR-lite baseline",
|
|
description="Public model repo with LFS checkpoint and readable metadata.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: apache-2.0
|
|
library_name: transformers
|
|
pipeline_tag: image-to-text
|
|
tags:
|
|
- ocr
|
|
- receipts
|
|
- multilingual
|
|
---
|
|
|
|
# aurora-ocr-lite
|
|
|
|
An OCR-focused checkpoint for receipt snippets, payment slips,
|
|
and service counter paperwork.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"config.json",
|
|
json_bytes(
|
|
{
|
|
"backbone": "vit-small-patch16-384",
|
|
"decoder": "bart-base",
|
|
"max_position_embeddings": 512,
|
|
"torch_dtype": "float16",
|
|
}
|
|
),
|
|
),
|
|
(
|
|
"vocab.txt",
|
|
text_bytes(
|
|
"""
|
|
[PAD]
|
|
[UNK]
|
|
total
|
|
subtotal
|
|
tax
|
|
cashier
|
|
paid
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"checkpoints/aurora-ocr-lite.safetensors",
|
|
lfs_blob("aurora-ocr-lite"),
|
|
),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add benchmark export and release notes",
|
|
description="Keep one public org model slightly more active for trending and history views.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: apache-2.0
|
|
library_name: transformers
|
|
pipeline_tag: image-to-text
|
|
tags:
|
|
- ocr
|
|
- receipts
|
|
- multilingual
|
|
---
|
|
|
|
# aurora-ocr-lite
|
|
|
|
An OCR-focused checkpoint for receipt snippets, payment slips,
|
|
and service counter paperwork.
|
|
|
|
## Release notes
|
|
|
|
- reduced hallucinated currency markers on narrow receipt crops
|
|
- added benchmark export used by the admin dashboard smoke tests
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"eval/benchmark.json",
|
|
json_bytes(
|
|
{
|
|
"cer": 0.081,
|
|
"wer": 0.119,
|
|
"latency_ms_p50": 64,
|
|
"latency_ms_p95": 92,
|
|
}
|
|
),
|
|
),
|
|
(
|
|
"scripts/export_notes.md",
|
|
text_bytes(
|
|
"""
|
|
# Export Notes
|
|
|
|
Checkpoint is intentionally small and fake. It only exists so local
|
|
flows hit LFS, quota, and file-tree code paths.
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
branch="benchmark-v2",
|
|
tag="v0.3.0",
|
|
download_path="checkpoints/aurora-ocr-lite.safetensors",
|
|
download_sessions=12,
|
|
),
|
|
RepoSeed(
|
|
actor="leo_park",
|
|
repo_type="dataset",
|
|
namespace="aurora-labs",
|
|
name="receipt-layout-bench",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Create receipt layout benchmark",
|
|
description="Public dataset repo with JSONL splits for dataset preview coverage.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: cc-by-4.0
|
|
pretty_name: Receipt Layout Bench
|
|
task_categories:
|
|
- token-classification
|
|
---
|
|
|
|
# receipt-layout-bench
|
|
|
|
Annotation benchmark for merchant, total, tax, and timestamp spans.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"splits/train.jsonl",
|
|
jsonl_bytes(
|
|
(
|
|
{
|
|
"image": "train_0001.png",
|
|
"merchant": "North Pier Cafe",
|
|
"total": "18.40",
|
|
"currency": "USD",
|
|
},
|
|
{
|
|
"image": "train_0002.png",
|
|
"merchant": "River Town Mart",
|
|
"total": "42.15",
|
|
"currency": "USD",
|
|
},
|
|
)
|
|
),
|
|
),
|
|
(
|
|
"splits/test.jsonl",
|
|
jsonl_bytes(
|
|
(
|
|
{
|
|
"image": "test_0001.png",
|
|
"merchant": "Airport Bento",
|
|
"total": "9.80",
|
|
"currency": "USD",
|
|
},
|
|
{
|
|
"image": "test_0002.png",
|
|
"merchant": "Harbor Books",
|
|
"total": "27.10",
|
|
"currency": "USD",
|
|
},
|
|
)
|
|
),
|
|
),
|
|
(
|
|
"schema/fields.json",
|
|
json_bytes(
|
|
{
|
|
"merchant": "string",
|
|
"total": "string",
|
|
"currency": "string",
|
|
"timestamp": "string",
|
|
}
|
|
),
|
|
),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add annotation guide",
|
|
description="Second dataset commit for history, tree diffing, and docs rendering.",
|
|
files=(
|
|
(
|
|
"docs/annotation-guide.md",
|
|
text_bytes(
|
|
"""
|
|
# Annotation Guide
|
|
|
|
- mark printed totals, not handwritten notes
|
|
- keep currency in a dedicated field
|
|
- preserve merchant spelling from source image
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: cc-by-4.0
|
|
pretty_name: Receipt Layout Bench
|
|
task_categories:
|
|
- token-classification
|
|
---
|
|
|
|
# receipt-layout-bench
|
|
|
|
Annotation benchmark for merchant, total, tax, and timestamp spans.
|
|
|
|
The local seed intentionally mixes neat and messy receipts to cover
|
|
pagination, filters, and table previews.
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
branch="supplier-a-refresh",
|
|
tag="v1.0.0",
|
|
download_path="splits/test.jsonl",
|
|
download_sessions=5,
|
|
),
|
|
RepoSeed(
|
|
actor="mai_lin",
|
|
repo_type="model",
|
|
namespace="aurora-labs",
|
|
name="customer-support-rag",
|
|
private=True,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Seed private support model workspace",
|
|
description="Private org repo for auth-only browsing and settings checks.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
# customer-support-rag
|
|
|
|
Internal-only retrieval and prompt assets for support workflows.
|
|
This repo is private and visible to Aurora Labs members only.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"prompt/system.txt",
|
|
text_bytes(
|
|
"""
|
|
You are a cautious support assistant. Answer only with facts from
|
|
the indexed knowledge base, and cite the exact article title.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"retrieval/index-schema.json",
|
|
json_bytes(
|
|
{
|
|
"article_id": "string",
|
|
"channel": "string",
|
|
"lang": "string",
|
|
"text": "string",
|
|
}
|
|
),
|
|
),
|
|
(
|
|
"config.json",
|
|
json_bytes(
|
|
{
|
|
"chunk_size": 384,
|
|
"embedding_model": "bge-small-en-v1.5",
|
|
"top_k": 6,
|
|
}
|
|
),
|
|
),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add ops runbook",
|
|
description="Keep a second private-org commit for local history inspection.",
|
|
files=(
|
|
(
|
|
"docs/runbook.md",
|
|
text_bytes(
|
|
"""
|
|
# Runbook
|
|
|
|
- refresh embeddings weekly
|
|
- snapshot prompts before frontend demos
|
|
- record regressions against the fixed local seed data
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
download_path="prompt/system.txt",
|
|
download_sessions=1,
|
|
),
|
|
RepoSeed(
|
|
actor="noah_kim",
|
|
repo_type="model",
|
|
namespace="harbor-vision",
|
|
name="marine-seg-small",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Publish marine segmentation starter model",
|
|
description="Public vision model with another fake LFS checkpoint.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: apache-2.0
|
|
pipeline_tag: image-segmentation
|
|
tags:
|
|
- segmentation
|
|
- marine
|
|
- edge
|
|
---
|
|
|
|
# marine-seg-small
|
|
|
|
Compact segmentation model for harbor waterlines, safety zones,
|
|
and dock equipment outlines.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"config.json",
|
|
json_bytes(
|
|
{
|
|
"backbone": "convnext-tiny",
|
|
"classes": ["water", "dock", "vessel", "buoy"],
|
|
"input_size": 512,
|
|
}
|
|
),
|
|
),
|
|
(
|
|
"labels.json",
|
|
json_bytes(
|
|
{
|
|
"0": "water",
|
|
"1": "dock",
|
|
"2": "vessel",
|
|
"3": "buoy",
|
|
}
|
|
),
|
|
),
|
|
(
|
|
"checkpoints/marine-seg-small.safetensors",
|
|
lfs_blob("marine-seg-small"),
|
|
),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add harbor evaluation report",
|
|
description="Second model commit for history and stats coverage.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: apache-2.0
|
|
pipeline_tag: image-segmentation
|
|
tags:
|
|
- segmentation
|
|
- marine
|
|
- edge
|
|
---
|
|
|
|
# marine-seg-small
|
|
|
|
Compact segmentation model for harbor waterlines, safety zones,
|
|
and dock equipment outlines.
|
|
|
|
## Eval highlights
|
|
|
|
- best IoU on waterline masks from overcast camera feeds
|
|
- weaker on stacked cargo edges during dusk
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"eval/coastal-harbor.json",
|
|
json_bytes(
|
|
{
|
|
"iou_dock": 0.84,
|
|
"iou_vessel": 0.79,
|
|
"iou_water": 0.91,
|
|
}
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
branch="saltwater-eval",
|
|
tag="v1.1.0",
|
|
download_path="checkpoints/marine-seg-small.safetensors",
|
|
download_sessions=6,
|
|
),
|
|
RepoSeed(
|
|
actor="noah_kim",
|
|
repo_type="space",
|
|
namespace="harbor-vision",
|
|
name="smoke-test-dashboard",
|
|
private=True,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Create private smoke-test dashboard",
|
|
description="Private org space used for auth and space rendering checks.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
# smoke-test-dashboard
|
|
|
|
Private dashboard for camera ingest smoke tests and deployment sign-off.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"app.py",
|
|
text_bytes(
|
|
"""
|
|
import gradio as gr
|
|
|
|
dashboard = gr.Interface(
|
|
fn=lambda status: f"dashboard status: {status}",
|
|
inputs=gr.Textbox(label="Input"),
|
|
outputs=gr.Textbox(label="Output"),
|
|
title="Smoke Test Dashboard",
|
|
)
|
|
|
|
if __name__ == "__main__":
|
|
dashboard.launch()
|
|
"""
|
|
),
|
|
),
|
|
("requirements.txt", text_bytes("gradio>=4.44.0")),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add dashboard notes",
|
|
description="Second private-space commit for browsing stateful history locally.",
|
|
files=(
|
|
(
|
|
"dashboards/README.md",
|
|
text_bytes(
|
|
"""
|
|
# Dashboard Notes
|
|
|
|
Fixed local fixtures are better than random telemetry when the goal
|
|
is to reproduce layout and auth bugs.
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
download_path="README.md",
|
|
download_sessions=1,
|
|
),
|
|
RepoSeed(
|
|
actor="leo_park",
|
|
repo_type="space",
|
|
namespace="leo_park",
|
|
name="formula-checker-lite",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Create public formula checker demo",
|
|
description="Lightweight public space for user profile and space listings.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
# formula-checker-lite
|
|
|
|
Small browser demo that validates spreadsheet-style formulas and
|
|
flags obviously broken references.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"app.py",
|
|
text_bytes(
|
|
"""
|
|
import gradio as gr
|
|
|
|
def validate(expr: str) -> str:
|
|
return "looks valid" if "=" in expr else "missing leading ="
|
|
|
|
demo = gr.Interface(
|
|
fn=validate,
|
|
inputs=gr.Textbox(label="Formula"),
|
|
outputs=gr.Textbox(label="Status"),
|
|
title="Formula Checker Lite",
|
|
)
|
|
|
|
if __name__ == "__main__":
|
|
demo.launch()
|
|
"""
|
|
),
|
|
),
|
|
("requirements.txt", text_bytes("gradio>=4.44.0")),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add preset expressions",
|
|
description="Second commit keeps this user-owned space non-trivial.",
|
|
files=(
|
|
(
|
|
"assets/presets.json",
|
|
json_bytes(
|
|
{
|
|
"valid": "=SUM(A1:A3)",
|
|
"invalid": "SUM(A1:A3)",
|
|
"cross_sheet": "=Sheet2!B4",
|
|
}
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
download_path="README.md",
|
|
download_sessions=2,
|
|
),
|
|
RepoSeed(
|
|
actor="sara_chen",
|
|
repo_type="dataset",
|
|
namespace="sara_chen",
|
|
name="invoice-entities-mini",
|
|
private=False,
|
|
commits=(
|
|
CommitSeed(
|
|
summary="Seed invoice entity dataset",
|
|
description="Public user dataset so profile pages are not empty.",
|
|
files=(
|
|
(
|
|
"README.md",
|
|
text_bytes(
|
|
"""
|
|
---
|
|
license: cc-by-4.0
|
|
pretty_name: Invoice Entities Mini
|
|
task_categories:
|
|
- token-classification
|
|
---
|
|
|
|
# invoice-entities-mini
|
|
|
|
Tiny invoice entity dataset for local schema, preview, and table rendering checks.
|
|
"""
|
|
),
|
|
),
|
|
(
|
|
"data/train.jsonl",
|
|
jsonl_bytes(
|
|
(
|
|
{
|
|
"invoice_id": "inv_1001",
|
|
"vendor": "Blue Harbor Logistics",
|
|
"amount": "1240.00",
|
|
},
|
|
{
|
|
"invoice_id": "inv_1002",
|
|
"vendor": "Northline Design",
|
|
"amount": "315.50",
|
|
},
|
|
)
|
|
),
|
|
),
|
|
(
|
|
"data/test.jsonl",
|
|
jsonl_bytes(
|
|
(
|
|
{
|
|
"invoice_id": "inv_2001",
|
|
"vendor": "River Street Foods",
|
|
"amount": "89.20",
|
|
},
|
|
)
|
|
),
|
|
),
|
|
(
|
|
"schema.json",
|
|
json_bytes(
|
|
{
|
|
"invoice_id": "string",
|
|
"vendor": "string",
|
|
"amount": "string",
|
|
}
|
|
),
|
|
),
|
|
),
|
|
),
|
|
CommitSeed(
|
|
summary="Add notebook notes",
|
|
description="Second public dataset commit for file tree and commit history coverage.",
|
|
files=(
|
|
(
|
|
"notebooks/README.md",
|
|
text_bytes(
|
|
"""
|
|
# Notebook Notes
|
|
|
|
Keep the local seed tiny. If a preview bug shows up here, it is much
|
|
easier to reason about than a random large import.
|
|
"""
|
|
),
|
|
),
|
|
),
|
|
),
|
|
),
|
|
download_path="data/train.jsonl",
|
|
download_sessions=3,
|
|
),
|
|
)
|
|
|
|
|
|
# Materialized once at import time so every consumer shares the same tuple.
REPO_SEEDS = build_repo_seeds()
|
|
|
|
# Likes to record, as (liking username, repo type, repo namespace, repo name).
# Only public seeded repositories are referenced.
LIKES: tuple[tuple[str, str, str, str], ...] = (
    # leo_park
    ("leo_park", "model", "mai_lin", "lineart-caption-base"),
    ("leo_park", "dataset", "mai_lin", "street-sign-zh-en"),
    ("leo_park", "model", "harbor-vision", "marine-seg-small"),
    # sara_chen
    ("sara_chen", "model", "mai_lin", "lineart-caption-base"),
    ("sara_chen", "model", "aurora-labs", "aurora-ocr-lite"),
    ("sara_chen", "dataset", "aurora-labs", "receipt-layout-bench"),
    # noah_kim
    ("noah_kim", "model", "aurora-labs", "aurora-ocr-lite"),
    ("noah_kim", "dataset", "mai_lin", "street-sign-zh-en"),
    ("noah_kim", "space", "leo_park", "formula-checker-lite"),
    # ivy_ops
    ("ivy_ops", "model", "mai_lin", "lineart-caption-base"),
    ("ivy_ops", "model", "aurora-labs", "aurora-ocr-lite"),
    ("ivy_ops", "dataset", "sara_chen", "invoice-entities-mini"),
    # mai_lin
    ("mai_lin", "model", "harbor-vision", "marine-seg-small"),
    ("mai_lin", "space", "leo_park", "formula-checker-lite"),
    ("mai_lin", "dataset", "aurora-labs", "receipt-layout-bench"),
)
|
|
|
|
|
|
def account_index() -> dict[str, AccountSeed]:
    """Return a fresh ``username -> AccountSeed`` lookup built from ACCOUNTS."""
    by_username: dict[str, AccountSeed] = {}
    for seed in ACCOUNTS:
        by_username[seed.username] = seed
    return by_username
|
|
|
|
|
|
def repo_slug(repo: RepoSeed) -> str:
    """Return a flat ``<type>-<namespace>-<name>`` identifier for *repo*.

    Any ``/`` in the combined string is replaced with ``-``.
    """
    combined = "{}-{}-{}".format(repo.repo_type, repo.namespace, repo.name)
    return combined.replace("/", "-")
|
|
|
|
|
|
def make_avatar_bytes(label: str, background: str, accent: str) -> bytes:
    """Render a 512x512 PNG avatar and return its encoded bytes.

    The image is a *background*-filled square with an *accent* rounded
    outline and an *accent* disc; up to two initials derived from *label*
    are drawn in the *background* colour using Pillow's default font.

    Args:
        label: Name to derive initials from; hyphens count as word breaks.
        background: Fill colour of the canvas and of the initials.
        accent: Colour of the outline and the central disc.
    """
    side = 512
    canvas = Image.new("RGB", (side, side), background)
    pen = ImageDraw.Draw(canvas)

    pen.rounded_rectangle((48, 48, 464, 464), radius=96, outline=accent, width=16)
    pen.ellipse((120, 120, 392, 392), fill=accent)

    # First letter of each of the first two whitespace/hyphen separated words.
    words = label.replace("-", " ").split()
    initials = "".join(word[0].upper() for word in words[:2])

    font = ImageFont.load_default()
    left, top, right, bottom = pen.textbbox((0, 0), initials, font=font)
    # Centre the text box on the canvas.
    origin = ((side - (right - left)) / 2, (side - (bottom - top)) / 2)
    pen.text(origin, initials, fill=background, font=font)

    sink = io.BytesIO()
    canvas.save(sink, format="PNG")
    return sink.getvalue()
|
|
|
|
|
|
def describe_error(response: httpx.Response) -> str:
    """Format *response* as ``HTTP <status>: <detail>`` for error messages.

    The detail is the decoded JSON body when the body parses as JSON, and the
    raw response text otherwise.
    """
    try:
        detail = response.json()
    except Exception:
        # Best effort: a body that is not JSON is reported verbatim.
        detail = response.text
    return f"HTTP {response.status_code}: {detail}"
|
|
|
|
|
|
async def ensure_response(
    response: httpx.Response,
    action: str,
    allowed_statuses: tuple[int, ...] = (200,),
) -> httpx.Response:
    """Return *response* unchanged when its status code is allowed.

    Raises:
        SeedError: if the status code is not in *allowed_statuses*; the
            message names *action* and includes the described response.
    """
    if response.status_code in allowed_statuses:
        return response
    raise SeedError(f"{action} failed with {describe_error(response)}")
|
|
|
|
|
|
def url_to_internal_path(url: str) -> str:
    """Reduce *url* to its path plus query string, dropping scheme and host.

    An empty path becomes ``/``; any fragment is discarded.
    """
    parts = urlsplit(url)
    internal = parts.path if parts.path else "/"
    if not parts.query:
        return internal
    return f"{internal}?{parts.query}"
|
|
|
|
|
|
async def user_seed_exists(client: httpx.AsyncClient) -> bool:
    """Report whether the primary demo user is already present.

    A 404 from the user-type endpoint means the seed has not been applied.

    Raises:
        SeedError: if the endpoint answers with anything other than 200 or 404.
    """
    probe = await client.get(
        f"/api/users/{PRIMARY_USERNAME}/type",
        params={"fallback": "false"},
    )
    if probe.status_code != 404:
        await ensure_response(probe, f"check existing seed for {PRIMARY_USERNAME}")
        return True
    return False
|
|
|
|
|
|
async def register_account(client: httpx.AsyncClient, account: AccountSeed) -> None:
    """Register *account* using the shared default password.

    A 400 whose body mentions "exists" or "conflicts" is treated as success.

    Raises:
        SeedError: for any other non-200 response.
    """
    credentials = {
        "username": account.username,
        "email": account.email,
        "password": DEFAULT_PASSWORD,
    }
    response = await client.post("/api/auth/register", json=credentials)

    status = response.status_code
    if status == 200:
        return

    if status == 400:
        message = str(response.json())
        if any(marker in message for marker in ("exists", "conflicts")):
            return

    raise SeedError(f"register {account.username} failed with {describe_error(response)}")
|
|
|
|
|
|
async def login_account(client: httpx.AsyncClient, account: AccountSeed) -> None:
    """Log *account* in on *client* and check that a session cookie was stored.

    Raises:
        SeedError: if the login is not answered with 200, or if *client* holds
            no ``session_id`` cookie afterwards.
    """
    credentials = {"username": account.username, "password": DEFAULT_PASSWORD}
    response = await client.post("/api/auth/login", json=credentials)
    await ensure_response(response, f"login {account.username}")

    has_session = "session_id" in client.cookies
    if not has_session:
        raise SeedError(f"login {account.username} did not set a session cookie")
|
|
|
|
|
|
async def upload_avatar(
    client: httpx.AsyncClient,
    path: str,
    label: str,
    background: str,
    accent: str,
) -> None:
    """Generate an avatar for *label* and POST it to *path* as a multipart file.

    Raises:
        SeedError: if the upload is not answered with 200.
    """
    avatar_png = make_avatar_bytes(label, background, accent)
    upload = {"file": (f"{label}.png", avatar_png, "image/png")}
    response = await client.post(path, files=upload)
    await ensure_response(response, f"upload avatar for {label}")
|
|
|
|
|
|
async def configure_user_profile(client: httpx.AsyncClient, account: AccountSeed) -> None:
    """Write *account*'s profile fields to its settings endpoint, then upload its avatar.

    Raises:
        SeedError: if either request is not answered with 200.
    """
    profile = {
        "email": account.email,
        "full_name": account.full_name,
        "bio": account.bio,
        "website": account.website,
        "social_media": account.social_media,
    }
    response = await client.put(f"/api/users/{account.username}/settings", json=profile)
    await ensure_response(response, f"update user settings for {account.username}")

    avatar_endpoint = f"/api/users/{account.username}/avatar"
    await upload_avatar(
        client,
        avatar_endpoint,
        account.username,
        account.avatar_bg,
        account.avatar_accent,
    )
|
|
|
|
|
|
async def create_organization(
    client: httpx.AsyncClient, organization: OrganizationSeed
) -> None:
    """Create *organization*, treating an "already exists" 400 as success.

    Raises:
        SeedError: for any other non-200 response.
    """
    body = {
        "name": organization.name,
        "description": organization.description,
    }
    response = await client.post("/org/create", json=body)

    if response.status_code == 200:
        return

    # The JSON body is only inspected for 400 responses.
    already_present = response.status_code == 400 and "already exists" in str(
        response.json()
    )
    if already_present:
        return

    raise SeedError(
        f"create organization {organization.name} failed with {describe_error(response)}"
    )
|
|
|
|
|
|
async def ensure_org_member(
    client: httpx.AsyncClient,
    org_name: str,
    username: str,
    role: str,
) -> None:
    """Add *username* to *org_name*, then set its role to *role*.

    The add request accepts both 200 and 400; the follow-up PUT must return 200.

    Raises:
        SeedError: if the add request returns another status, or the role
            update is not answered with 200.
    """
    membership = {"username": username, "role": role}
    response = await client.post(f"/org/{org_name}/members", json=membership)
    add_status = response.status_code
    if add_status != 200 and add_status != 400:
        raise SeedError(
            f"add {username} to {org_name} failed with {describe_error(response)}"
        )

    # PUT keeps roles deterministic even if the member already existed.
    role_response = await client.put(
        f"/org/{org_name}/members/{username}",
        json={"role": role},
    )
    await ensure_response(role_response, f"set role for {username} in {org_name}")
|
|
|
|
|
|
async def configure_organization(
    client: httpx.AsyncClient, organization: OrganizationSeed
) -> None:
    """Write *organization*'s profile fields to its settings endpoint, then upload its avatar.

    Raises:
        SeedError: if either request is not answered with 200.
    """
    settings = {
        "description": organization.description,
        "bio": organization.bio,
        "website": organization.website,
        "social_media": organization.social_media,
    }
    response = await client.put(
        f"/api/organizations/{organization.name}/settings", json=settings
    )
    await ensure_response(response, f"update organization settings for {organization.name}")

    avatar_endpoint = f"/api/organizations/{organization.name}/avatar"
    await upload_avatar(
        client,
        avatar_endpoint,
        organization.name,
        organization.avatar_bg,
        organization.avatar_accent,
    )
|
|
|
|
|
|
async def create_repo(client: httpx.AsyncClient, repo: RepoSeed) -> None:
    """Create *repo*, treating an "already exists" 400 as success.

    The ``organization`` field is sent only when the repo's namespace differs
    from the acting user.

    Raises:
        SeedError: for any other non-200 response.
    """
    owner_fields = (
        {} if repo.namespace == repo.actor else {"organization": repo.namespace}
    )
    payload = {
        "type": repo.repo_type,
        "name": repo.name,
        "private": repo.private,
        **owner_fields,
    }

    response = await client.post("/api/repos/create", json=payload)
    if response.status_code == 200:
        return

    already_present = response.status_code == 400 and "already exists" in str(
        response.json()
    )
    if already_present:
        return

    raise SeedError(f"create repo {repo.namespace}/{repo.name} failed with {describe_error(response)}")
|
|
|
|
|
|
async def upload_lfs_object(
    client: httpx.AsyncClient,
    repo: RepoSeed,
    content: bytes,
) -> tuple[str, int]:
    """Upload *content* as an LFS object for *repo* and return ``(oid, size)``.

    The object is announced to the repo's LFS batch endpoint. When the batch
    result carries an ``upload`` action, the bytes are PUT to its href; when
    it carries a ``verify`` action, a verify request is POSTed through
    *client*.

    Raises:
        SeedError: if the batch request fails, returns no objects, or reports
            an error for the object; if the PUT does not return 200/201; or
            if the verify request is not answered with 200.
    """
    oid = hashlib.sha256(content).hexdigest()
    size = len(content)
    repo_label = f"{repo.namespace}/{repo.name}"

    response = await client.post(
        f"/{repo.repo_type}s/{repo.namespace}/{repo.name}.git/info/lfs/objects/batch",
        json={
            "operation": "upload",
            "transfers": ["basic"],
            "objects": [{"oid": oid, "size": size}],
            "hash_algo": "sha256",
            # Local dev uses the frontend base_url publicly, so the seed script rewrites
            # verify URLs back onto the in-process backend transport.
            "is_browser": True,
        },
    )
    await ensure_response(response, f"prepare LFS upload for {repo_label}")

    # Fail with SeedError (the only error main() reports cleanly) instead of
    # a bare KeyError/IndexError when the batch result is empty.
    objects = response.json().get("objects") or []
    if not objects:
        raise SeedError(f"LFS batch returned no objects for {repo_label}")
    obj = objects[0]
    if obj.get("error"):
        raise SeedError(f"LFS batch returned an error for {repo.namespace}/{repo.name}: {obj['error']}")

    actions = obj.get("actions") or {}

    upload_action = actions.get("upload")
    if upload_action:
        upload_headers = upload_action.get("header") or {}
        # The upload href is requested with a separate client that is not
        # bound to the transport/base URL of *client*.
        async with httpx.AsyncClient(follow_redirects=False, timeout=60.0) as network_client:
            upload_response = await network_client.put(
                upload_action["href"],
                content=content,
                headers=upload_headers,
            )

        if upload_response.status_code not in (200, 201):
            raise SeedError(
                f"LFS upload failed for {repo.namespace}/{repo.name}: "
                f"HTTP {upload_response.status_code} {upload_response.text}"
            )

    verify_action = actions.get("verify")
    if verify_action:
        # Only the path and query of the verify href are used, so the request
        # goes through *client* itself.
        verify_response = await client.post(
            url_to_internal_path(verify_action["href"]),
            json={"oid": oid, "size": size},
        )
        await ensure_response(
            verify_response,
            f"verify LFS upload for {repo_label}",
        )

    return oid, size
|
|
|
|
|
|
async def commit_files(
    client: httpx.AsyncClient,
    repo: RepoSeed,
    commit: CommitSeed,
) -> None:
    """Commit every file of *commit* to the ``main`` revision of *repo*.

    The files' path, size and SHA-256 are first sent to the preupload
    endpoint. Its per-path answer decides how each file is sent: skipped when
    ``shouldIgnore`` is set, uploaded as an LFS object when ``uploadMode`` is
    ``"lfs"``, otherwise inlined as base64. The commit itself is posted as
    NDJSON: one header line followed by one line per file.

    Raises:
        SeedError: if the preupload or commit request is not answered with
            200, if the preupload answer has no entry for one of the files,
            or if an LFS upload fails.
    """
    repo_label = f"{repo.namespace}/{repo.name}"
    api_base = f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}"

    metadata = [
        {
            "path": path,
            "size": len(content),
            "sha256": hashlib.sha256(content).hexdigest(),
        }
        for path, content in commit.files
    ]

    preupload_response = await client.post(
        f"{api_base}/preupload/main",
        json={"files": metadata},
    )
    await ensure_response(preupload_response, f"preupload {repo_label}")
    preupload_results = {
        item["path"]: item for item in preupload_response.json().get("files", [])
    }

    ndjson_lines = [
        {
            "key": "header",
            "value": {
                "summary": commit.summary,
                "description": commit.description,
            },
        }
    ]

    for path, content in commit.files:
        result = preupload_results.get(path)
        if result is None:
            # Surface a SeedError rather than a bare KeyError so main()
            # reports the failure cleanly.
            raise SeedError(f"preupload for {repo_label} returned no entry for {path}")

        if result["shouldIgnore"]:
            continue

        if result["uploadMode"] == "lfs":
            oid, size = await upload_lfs_object(client, repo, content)
            ndjson_lines.append(
                {
                    "key": "lfsFile",
                    "value": {
                        "path": path,
                        "oid": oid,
                        "size": size,
                        "algo": "sha256",
                    },
                }
            )
        else:
            ndjson_lines.append(
                {
                    "key": "file",
                    "value": {
                        "path": path,
                        "content": base64.b64encode(content).decode("ascii"),
                        "encoding": "base64",
                    },
                }
            )

    ndjson_payload = "\n".join(json.dumps(line, sort_keys=True) for line in ndjson_lines)
    response = await client.post(
        f"{api_base}/commit/main",
        content=ndjson_payload,
        headers={"Content-Type": "application/x-ndjson"},
    )
    await ensure_response(response, f"commit {repo_label}")
|
|
|
|
|
|
async def create_branch(client: httpx.AsyncClient, repo: RepoSeed) -> None:
    """Create ``repo.branch`` from ``main``; do nothing when no branch is configured.

    A 400 or 409 whose body mentions "already exists" is treated as success.

    Raises:
        SeedError: for any other non-200 response.
    """
    if repo.branch:
        response = await client.post(
            f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/branch",
            json={"branch": repo.branch, "revision": "main"},
        )
        status = response.status_code
        if status == 200:
            return
        if (status == 400 or status == 409) and "already exists" in str(response.json()):
            return

        raise SeedError(
            f"create branch {repo.branch} for {repo.namespace}/{repo.name} failed with "
            f"{describe_error(response)}"
        )
|
|
|
|
|
|
async def create_tag(client: httpx.AsyncClient, repo: RepoSeed) -> None:
    """Create ``repo.tag`` pointing at ``main``; do nothing when no tag is configured.

    A 400 or 409 whose body mentions "already exists" is treated as success.

    Raises:
        SeedError: for any other non-200 response.
    """
    if repo.tag:
        response = await client.post(
            f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/tag",
            json={"tag": repo.tag, "revision": "main"},
        )
        status = response.status_code
        if status == 200:
            return
        if (status == 400 or status == 409) and "already exists" in str(response.json()):
            return

        raise SeedError(
            f"create tag {repo.tag} for {repo.namespace}/{repo.name} failed with "
            f"{describe_error(response)}"
        )
|
|
|
|
|
|
async def like_repo(
    client: httpx.AsyncClient,
    repo_type: str,
    namespace: str,
    name: str,
) -> None:
    """Like the given repo as the user behind *client*.

    A 400 whose body mentions "already liked" is treated as success.

    Raises:
        SeedError: for any other non-200 response.
    """
    like_endpoint = f"/api/{repo_type}s/{namespace}/{name}/like"
    response = await client.post(like_endpoint)

    status = response.status_code
    if status == 200:
        return
    if status == 400 and "already liked" in str(response.json()):
        return

    raise SeedError(
        f"like {repo_type}/{namespace}/{name} failed with {describe_error(response)}"
    )
|
|
|
|
|
|
async def trigger_download(
    client: httpx.AsyncClient,
    repo: RepoSeed,
    path: str,
    *,
    cookies: dict[str, str] | None = None,
) -> None:
    """Request *path* from *repo*'s ``resolve/main`` endpoint and expect a redirect.

    *cookies*, when given, are passed with this single request.

    Raises:
        SeedError: if the response status is neither 302 nor 307.
    """
    resolve_endpoint = (
        f"/api/{repo.repo_type}s/{repo.namespace}/{repo.name}/resolve/main/{path}"
    )
    response = await client.get(resolve_endpoint, cookies=cookies)

    if response.status_code in (302, 307):
        return

    raise SeedError(
        f"download seed for {repo.namespace}/{repo.name}:{path} failed with "
        f"{describe_error(response)}"
    )
|
|
|
|
|
|
def build_manifest() -> dict:
    """Assemble the manifest dictionary describing the seeded demo data.

    It lists the seed version, manifest location, UI and backend URLs, the
    primary and additional logins, the admin UI URL and token, and every
    seeded organization and repository.
    """
    additional_users = []
    for account in ACCOUNTS:
        if account.username == PRIMARY_USERNAME:
            continue
        additional_users.append(
            {
                "username": account.username,
                "password": DEFAULT_PASSWORD,
                "email": account.email,
            }
        )

    organizations = []
    for organization in ORGANIZATIONS:
        members = [
            {"username": username, "role": role}
            for username, role in organization.members
        ]
        organizations.append({"name": organization.name, "members": members})

    repositories = []
    for repo in REPO_SEEDS:
        repositories.append(
            {
                "type": repo.repo_type,
                "namespace": repo.namespace,
                "name": repo.name,
                "private": repo.private,
            }
        )

    main_login = {
        "username": PRIMARY_USERNAME,
        "password": DEFAULT_PASSWORD,
    }
    admin_ui = {
        "url": "http://127.0.0.1:5174",
        "token": cfg.admin.secret_token,
    }

    return {
        "seed_version": SEED_VERSION,
        "manifest_path": str(MANIFEST_PATH),
        "main_ui_url": cfg.app.base_url,
        "backend_url": INTERNAL_BASE_URL,
        "main_login": main_login,
        "additional_users": additional_users,
        "admin_ui": admin_ui,
        "organizations": organizations,
        "repositories": repositories,
    }
|
|
|
|
|
|
def write_manifest() -> None:
    """Write the manifest as indented, key-sorted JSON to ``MANIFEST_PATH``.

    Missing parent directories are created first; the file ends with a newline.
    """
    MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(build_manifest(), indent=2, sort_keys=True)
    MANIFEST_PATH.write_text(serialized + "\n", encoding="utf-8")
|
|
|
|
|
|
def print_summary(seed_applied: bool) -> None:
    """Print the seed state, manifest path, URLs and demo credentials to stdout."""
    if seed_applied:
        state = "Seeded"
    else:
        state = "Seed already present"

    report = (
        f"{state}: {SEED_VERSION}",
        f"Manifest: {MANIFEST_PATH}",
        f"Main UI: {cfg.app.base_url}",
        f"Backend: {INTERNAL_BASE_URL}",
        f"Login: {PRIMARY_USERNAME} / {DEFAULT_PASSWORD}",
        f"Admin UI token: {cfg.admin.secret_token}",
    )
    for line in report:
        print(line)
|
|
|
|
|
|
async def seed_demo_data() -> None:
    """Create the demo accounts, organizations, repos, likes and downloads.

    Requests are sent to the imported ``app`` through httpx's ASGI transport.
    If the primary user already exists, only the manifest and the summary are
    written. Otherwise the steps run in order: register and log in every
    account, create and configure organizations, create repos with their
    commits/branch/tag, apply likes, then trigger the seed downloads.

    Raises:
        SeedError: propagated from any failing seed step.
    """
    init_storage()
    transport = httpx.ASGITransport(app=app)

    async with AsyncExitStack() as stack:

        async def open_client() -> httpx.AsyncClient:
            """Open a client on the shared transport; the stack closes it on exit."""
            return await stack.enter_async_context(
                httpx.AsyncClient(
                    transport=transport,
                    base_url=INTERNAL_BASE_URL,
                    follow_redirects=False,
                )
            )

        seed_client = await open_client()

        if await user_seed_exists(seed_client):
            write_manifest()
            print_summary(seed_applied=False)
            return

        for account in ACCOUNTS:
            await register_account(seed_client, account)

        # One client per account so each keeps its own session cookies.
        authed_clients: dict[str, httpx.AsyncClient] = {}
        for account in ACCOUNTS:
            client = await open_client()
            await login_account(client, account)
            await configure_user_profile(client, account)
            authed_clients[account.username] = client

        primary_client = authed_clients[PRIMARY_USERNAME]
        for organization in ORGANIZATIONS:
            await create_organization(primary_client, organization)
            for username, role in organization.members:
                # The primary user creates the organization and is not re-added.
                if username == PRIMARY_USERNAME:
                    continue
                await ensure_org_member(primary_client, organization.name, username, role)
            await configure_organization(primary_client, organization)

        for repo in REPO_SEEDS:
            repo_client = authed_clients[repo.actor]
            await create_repo(repo_client, repo)
            for commit in repo.commits:
                await commit_files(repo_client, repo, commit)
            await create_branch(repo_client, repo)
            await create_tag(repo_client, repo)

        for liker, repo_type, namespace, name in LIKES:
            await like_repo(authed_clients[liker], repo_type, namespace, name)

        anon_client = await open_client()

        for repo in REPO_SEEDS:
            if not repo.download_path:
                continue

            if repo.private:
                # Private repos get a single download as the primary user.
                await trigger_download(primary_client, repo, repo.download_path)
                continue

            for session_number in range(repo.download_sessions):
                await trigger_download(
                    anon_client,
                    repo,
                    repo.download_path,
                    cookies={
                        "hf_download_session": f"seed-{repo_slug(repo)}-{session_number:02d}"
                    },
                )

        # Download tracking happens in background tasks off the API response path.
        await asyncio.sleep(0.5)

        write_manifest()
        print_summary(seed_applied=True)
|
|
|
|
|
|
def main() -> int:
    """Run the seeding coroutine and return a process exit code.

    Returns 0 on success. A SeedError is reported on stderr and yields 1.
    """
    try:
        asyncio.run(seed_demo_data())
    except SeedError as exc:
        print(f"Seed failed: {exc}", file=sys.stderr)
        return 1
    else:
        return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Exit with main()'s return value when run as a script.
    sys.exit(main())
|