Add version API, add lfs versioning/gc
@@ -38,3 +38,5 @@ debug_log_payloads = true
# LFS Garbage Collection settings
lfs_keep_versions = 5  # Keep last K versions of each LFS file
lfs_auto_gc = true  # Automatically delete old LFS objects on commit
# Site identification
site_name = "KohakuHub"  # Customizable site name (e.g., "MyCompany Hub")

@@ -42,6 +42,10 @@ services:
- KOHAKU_HUB_BASE_URL=http://127.0.0.1:28080 # Web UI will proxy requests to hub-api, we use hub-ui url here
- KOHAKU_HUB_DB_BACKEND=postgres
- KOHAKU_HUB_DATABASE_URL=postgresql://hub:hubpass@postgres:5432/hubdb # Linking to the 'postgres' service
- KOHAKU_HUB_LFS_THRESHOLD_BYTES=1000000
- KOHAKU_HUB_LFS_KEEP_VERSIONS=5
- KOHAKU_HUB_LFS_AUTO_GC=true
- KOHAKU_HUB_SITE_NAME=KohakuHub
## SMTP Configuration (Email Verification - Optional)
- KOHAKU_HUB_SMTP_ENABLED=false
- KOHAKU_HUB_SMTP_HOST=smtp.gmail.com

@@ -294,10 +294,22 @@
</p>
</div>
<template v-else>
  <!-- Header Row (desktop only) -->
  <div
    class="hidden md:grid md:grid-cols-[auto_1fr_120px_150px] gap-3 py-2 px-2 text-sm font-medium text-gray-600 dark:text-gray-400 border-b"
  >
    <div></div>
    <!-- Icon column -->
    <div>Name</div>
    <div class="text-right">Size</div>
    <div class="text-right">Last Modified</div>
  </div>

  <!-- File Rows -->
  <div
    v-for="file in filteredFiles"
    :key="file.path"
    class="py-3 flex items-center gap-3 hover:bg-gray-50 dark:hover:bg-gray-700 px-2 cursor-pointer transition-colors"
    class="py-3 grid grid-cols-[auto_1fr_auto] md:grid-cols-[auto_1fr_120px_150px] gap-3 items-center hover:bg-gray-50 dark:hover:bg-gray-700 px-2 cursor-pointer transition-colors"
    @click="handleFileClick(file)"
  >
    <div
@@ -308,14 +320,21 @@
      "
      class="text-xl flex-shrink-0"
    />
    <div class="flex-1 min-w-0">
    <div class="min-w-0">
      <div class="font-medium truncate">
        {{ getFileName(file.path) }}
      </div>
    </div>
    <div class="text-sm text-gray-500 dark:text-gray-400">
    <div
      class="text-sm text-gray-500 dark:text-gray-400 text-right"
    >
      {{ formatSize(file.size) }}
    </div>
    <div
      class="hidden md:block text-sm text-gray-500 dark:text-gray-400 text-right"
    >
      {{ formatLastModified(file.lastModified) }}
    </div>
  </div>

  <div
@@ -433,6 +452,10 @@
  <span class="text-gray-600 dark:text-gray-400">Created:</span>
  <span class="ml-2">{{ formatDate(repoInfo?.createdAt) }}</span>
</div>
<div v-if="repoInfo?.lastModified">
  <span class="text-gray-600 dark:text-gray-400">Updated:</span>
  <span class="ml-2">{{ formatDate(repoInfo?.lastModified) }}</span>
</div>
<div v-if="repoInfo?.sha">
  <span class="text-gray-600 dark:text-gray-400">Commit:</span>
  <span class="ml-2 font-mono text-xs">{{
@@ -551,6 +574,7 @@ const pathSegments = computed(() => {
});

const filteredFiles = computed(() => {
  // Backend now provides folder stats, so just filter
  if (!fileSearchQuery.value) return fileTree.value;

  const query = fileSearchQuery.value.toLowerCase();
@@ -582,6 +606,15 @@ function formatSize(bytes) {
  return (bytes / (1024 * 1024 * 1024)).toFixed(1) + " GB";
}

function formatLastModified(dateString) {
  if (!dateString) return "-";
  try {
    return dayjs(dateString).fromNow();
  } catch (e) {
    return "-";
  }
}

function getFileName(path) {
  const parts = path.split("/");
  return parts[parts.length - 1] || path;

@@ -468,17 +468,53 @@ async def list_repo_tree(
# Remove prefix from path to get relative path
relative_path = obj.path[prefix_len:] if prefix else obj.path

# Calculate folder stats by listing its contents recursively
folder_size = 0
folder_latest_mtime = None

try:
    # List all objects in this folder recursively
    folder_contents = await async_client.list_objects(
        repository=lakefs_repo,
        ref=revision,
        prefix=obj.path,  # Use full path as prefix
        delimiter="",  # No delimiter = recursive
        amount=1000,
    )

    # Calculate total size and find latest modification
    for child_obj in folder_contents.results:
        if child_obj.path_type == "object":
            folder_size += child_obj.size_bytes or 0
            if hasattr(child_obj, "mtime") and child_obj.mtime:
                if (
                    folder_latest_mtime is None
                    or child_obj.mtime > folder_latest_mtime
                ):
                    folder_latest_mtime = child_obj.mtime

except Exception as e:
    logger.debug(
        f"Could not calculate stats for folder {obj.path}: {str(e)}"
    )

dir_obj = {
    "type": "directory",
    "oid": (
        obj.checksum if hasattr(obj, "checksum") and obj.checksum else ""
    ),
    "size": 0,
    "size": folder_size,
    "path": relative_path.rstrip("/"),  # Remove trailing slash
}

# Add last modified info if available
if hasattr(obj, "mtime") and obj.mtime:
# Add last modified info
if folder_latest_mtime:
    from datetime import datetime

    dir_obj["lastModified"] = datetime.fromtimestamp(
        folder_latest_mtime
    ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
elif hasattr(obj, "mtime") and obj.mtime:
    from datetime import datetime

    dir_obj["lastModified"] = datetime.fromtimestamp(obj.mtime).strftime(
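
The folder-stat calculation above reduces to a sum over child sizes and a max over child mtimes from the recursive listing; since the listing is requested with amount=1000 and is not paginated in this hunk, stats for folders holding more than 1000 objects would reflect only the first batch. A minimal standalone sketch of the same aggregation (the tuples below are illustrative stand-ins for LakeFS object stats, not real API objects):

    # Hypothetical (size_bytes, mtime) pairs for objects under one folder prefix
    children = [(120, 1_700_000_000), (4_096, 1_700_000_500), (0, None)]

    folder_size = sum(size or 0 for size, _ in children)                    # 4216
    folder_latest_mtime = max((m for _, m in children if m), default=None)  # 1700000500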
@@ -509,6 +509,7 @@ async def commit(
# Process operations
files_changed = False  # Track if any files actually changed
pending_lfs_tracking = []  # Track LFS objects to record in history after commit

for op in operations:
    key = op["key"]
@@ -698,35 +699,15 @@ async def commit(
logger.success(f"Updated database record for LFS file: {path}")

# NOW delete the old LFS object if it exists and is not used elsewhere
if old_lfs_oid:
    # Check if this OID is still used by other files (deduplication check)
    other_uses = (
        File.select()
        .where((File.sha256 == old_lfs_oid) & (File.lfs == True))
        .count()
    )

    if other_uses == 0:
        # Safe to delete - not used anywhere
        old_lfs_key = (
            f"lfs/{old_lfs_oid[:2]}/{old_lfs_oid[2:4]}/{old_lfs_oid}"
        )
        try:
            from .s3_utils import get_s3_client

            s3_client = get_s3_client()
            s3_client.delete_object(Bucket=cfg.s3.bucket, Key=old_lfs_key)
            logger.success(f"Deleted old LFS object: {old_lfs_key}")
        except Exception as e:
            # Log but don't fail - the new file is already linked successfully
            logger.warning(
                f"Failed to delete old LFS object {old_lfs_key}: {e}"
            )
    else:
        logger.success(
            f"Keeping old LFS object {old_lfs_oid} - still used by {other_uses} file(s)"
        )
# Track this LFS object for GC after commit
pending_lfs_tracking.append(
    {
        "path": path,
        "sha256": oid,
        "size": size,
        "old_sha256": old_lfs_oid,
    }
)

elif key == "deletedFile":
    # Delete a single file
@@ -967,6 +948,32 @@ async def commit(
commit_url = f"{cfg.app.base_url}/{repo_id}/commit/{commit_result.id}"
logger.success(f"Commit URL: {commit_url}")

# Now that we have commit_id, track LFS objects and run GC
if pending_lfs_tracking:
    from .gc_utils import track_lfs_object, run_gc_for_file

    for lfs_info in pending_lfs_tracking:
        # Track the new LFS object in history
        track_lfs_object(
            repo_full_id=repo_id,
            path_in_repo=lfs_info["path"],
            sha256=lfs_info["sha256"],
            size=lfs_info["size"],
            commit_id=commit_result.id,
        )

        # Run GC for this file if enabled
        if cfg.app.lfs_auto_gc and lfs_info.get("old_sha256"):
            deleted_count = run_gc_for_file(
                repo_full_id=repo_id,
                path_in_repo=lfs_info["path"],
                current_commit_id=commit_result.id,
            )
            if deleted_count > 0:
                logger.info(
                    f"GC: Cleaned up {deleted_count} old version(s) of {lfs_info['path']}"
                )

return {
    "commitUrl": commit_url,
    "commitOid": commit_result.id,

src/kohakuhub/api/gc_utils.py (new file, 201 lines)
@@ -0,0 +1,201 @@
"""Garbage collection utilities for LFS objects."""

from typing import List, Optional
from datetime import datetime, timezone

from ..db import LFSObjectHistory, File
from ..config import cfg
from ..logger import get_logger

logger = get_logger("GC")


def track_lfs_object(
    repo_full_id: str,
    path_in_repo: str,
    sha256: str,
    size: int,
    commit_id: str,
):
    """Track LFS object usage in a commit.

    Args:
        repo_full_id: Full repository ID (namespace/name)
        path_in_repo: File path in repository
        sha256: LFS object SHA256 hash
        size: Object size in bytes
        commit_id: LakeFS commit ID
    """
    LFSObjectHistory.create(
        repo_full_id=repo_full_id,
        path_in_repo=path_in_repo,
        sha256=sha256,
        size=size,
        commit_id=commit_id,
    )
    logger.debug(
        f"Tracked LFS object {sha256[:8]} for {path_in_repo} in commit {commit_id[:8]}"
    )


def get_old_lfs_versions(
    repo_full_id: str,
    path_in_repo: str,
    keep_count: int,
) -> List[str]:
    """Get old LFS object hashes that should be garbage collected.

    Args:
        repo_full_id: Full repository ID
        path_in_repo: File path
        keep_count: Number of versions to keep

    Returns:
        List of SHA256 hashes to delete
    """
    # Get all historical versions for this file, sorted by creation date (newest first)
    history = (
        LFSObjectHistory.select()
        .where(
            (LFSObjectHistory.repo_full_id == repo_full_id)
            & (LFSObjectHistory.path_in_repo == path_in_repo)
        )
        .order_by(LFSObjectHistory.created_at.desc())
    )

    all_versions = list(history)

    if len(all_versions) <= keep_count:
        # Not enough versions to trigger GC
        logger.debug(
            f"Only {len(all_versions)} versions of {path_in_repo}, keeping all"
        )
        return []

    # Keep the newest K versions, delete the rest
    versions_to_keep = all_versions[:keep_count]
    versions_to_delete = all_versions[keep_count:]

    keep_hashes = {v.sha256 for v in versions_to_keep}
    delete_hashes = []

    for old_version in versions_to_delete:
        # Only delete if not in the "keep" set (shouldn't happen, but safety check)
        if old_version.sha256 not in keep_hashes:
            delete_hashes.append(old_version.sha256)

    logger.info(
        f"GC for {path_in_repo}: keeping {len(versions_to_keep)} versions, "
        f"marking {len(delete_hashes)} for deletion"
    )

    return delete_hashes


def cleanup_lfs_object(sha256: str, repo_full_id: Optional[str] = None) -> bool:
    """Delete an LFS object from S3 if it's not used anywhere.

    Args:
        sha256: LFS object hash
        repo_full_id: Optional - restrict check to specific repo

    Returns:
        True if deleted, False if still in use or deletion failed
    """
    # Check if this object is still referenced in current files
    query = File.select().where((File.sha256 == sha256) & (File.lfs == True))
    if repo_full_id:
        query = query.where(File.repo_full_id == repo_full_id)

    current_uses = query.count()

    if current_uses > 0:
        logger.debug(
            f"LFS object {sha256[:8]} still used by {current_uses} file(s), keeping"
        )
        return False

    # Check if this object is referenced in any commit history (other repos might use it)
    if not repo_full_id:
        # Global check across all repos
        history_uses = (
            LFSObjectHistory.select().where(LFSObjectHistory.sha256 == sha256).count()
        )

        if history_uses > 0:
            logger.debug(
                f"LFS object {sha256[:8]} in history ({history_uses} references), keeping"
            )
            return False

    # Safe to delete from S3
    try:
        from .s3_utils import get_s3_client

        lfs_key = f"lfs/{sha256[:2]}/{sha256[2:4]}/{sha256}"
        s3_client = get_s3_client()
        s3_client.delete_object(Bucket=cfg.s3.bucket, Key=lfs_key)

        logger.success(f"Deleted LFS object from S3: {lfs_key}")

        # Remove from history table
        if repo_full_id:
            deleted_count = (
                LFSObjectHistory.delete()
                .where(
                    (LFSObjectHistory.repo_full_id == repo_full_id)
                    & (LFSObjectHistory.sha256 == sha256)
                )
                .execute()
            )
        else:
            deleted_count = (
                LFSObjectHistory.delete()
                .where(LFSObjectHistory.sha256 == sha256)
                .execute()
            )

        logger.info(f"Removed {deleted_count} history records for {sha256[:8]}")
        return True

    except Exception as e:
        logger.exception(f"Failed to delete LFS object {sha256[:8]}", e)
        return False


def run_gc_for_file(
    repo_full_id: str,
    path_in_repo: str,
    current_commit_id: str,
) -> int:
    """Run garbage collection for a specific file.

    Args:
        repo_full_id: Full repository ID
        path_in_repo: File path
        current_commit_id: Current commit ID

    Returns:
        Number of objects deleted
    """
    if not cfg.app.lfs_auto_gc:
        logger.debug("Auto GC disabled, skipping")
        return 0

    keep_count = cfg.app.lfs_keep_versions
    old_hashes = get_old_lfs_versions(repo_full_id, path_in_repo, keep_count)

    if not old_hashes:
        return 0

    deleted_count = 0
    for sha256 in old_hashes:
        if cleanup_lfs_object(sha256, repo_full_id):
            deleted_count += 1

    if deleted_count > 0:
        logger.success(
            f"GC completed for {path_in_repo}: deleted {deleted_count} old version(s)"
        )

    return deleted_count
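
A worked example of the retention rule above (values illustrative): with lfs_keep_versions = 5 and seven history rows for one path, get_old_lfs_versions keeps the five newest sha256 hashes and returns the two oldest; each returned hash is then deleted from S3 by cleanup_lfs_object only if no current File row (and, in the global check, no remaining history row) still references it.

    keep_count = 5
    versions = ["v7", "v6", "v5", "v4", "v3", "v2", "v1"]  # newest first; placeholder hashes
    to_keep, to_delete = versions[:keep_count], versions[keep_count:]
    assert to_delete == ["v2", "v1"]  # only the two oldest become GC candidates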
@@ -26,9 +26,12 @@ def get_version():
    Returns:
        Site identification and version information
    """
    from ..config import cfg

    return {
        "site": "kohakuhub",
        "version": "0.0.1",
        "name": cfg.app.site_name,
    }
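
A client can use this response to detect a KohakuHub server and read its configured display name. A minimal sketch (the route this function is mounted on is not shown in this hunk, so the /api/version path and hub URL below are assumptions):

    import requests

    info = requests.get("https://hub.example.com/api/version").json()  # path assumed
    if info.get("site") == "kohakuhub":
        print(f"Connected to {info['name']} (KohakuHub API, version {info['version']})")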
@@ -86,6 +89,8 @@ def whoami_v2(user: User = Depends(get_optional_user)):
        }
    )

    from ..config import cfg

    return {
        "type": "user",
        "id": str(user.id),
@@ -100,4 +105,10 @@ def whoami_v2(user: User = Depends(get_optional_user)):
            "type": "access_token",
            "accessToken": {"displayName": "Auto-generated token", "role": "write"},
        },
        # KohakuHub-specific fields
        "site": {
            "name": cfg.app.site_name,  # Configurable site name
            "api": "kohakuhub",  # Hardcoded API identifier
            "version": "0.0.1",  # Hardcoded version
        },
    }

@@ -52,6 +52,8 @@ class AppConfig(BaseModel):
    # LFS Garbage Collection settings
    lfs_keep_versions: int = 5  # Keep last K versions of each file
    lfs_auto_gc: bool = False  # Auto-delete old LFS objects on commit
    # Site identification
    site_name: str = "KohakuHub"  # Configurable site name (e.g., "MyCompany Hub")


class Config(BaseModel):
@@ -117,6 +119,10 @@ def load_config(path: str = None) -> Config:
        lfs_threshold_bytes=int(
            os.environ.get("KOHAKU_HUB_LFS_THRESHOLD_BYTES", "10000000")
        ),
        lfs_keep_versions=int(os.environ.get("KOHAKU_HUB_LFS_KEEP_VERSIONS", "5")),
        lfs_auto_gc=os.environ.get("KOHAKU_HUB_LFS_AUTO_GC", "false").lower()
        == "true",
        site_name=os.environ.get("KOHAKU_HUB_SITE_NAME", "KohakuHub"),
    )

    return Config(
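
As with the settings above, the new options can also be configured purely through the environment, e.g. KOHAKU_HUB_LFS_KEEP_VERSIONS=10, KOHAKU_HUB_LFS_AUTO_GC=true (compared case-insensitively against "true"), and KOHAKU_HUB_SITE_NAME="MyCompany Hub"; this is how the docker-compose changes earlier in this commit set them.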