Add version API, add lfs versioning/gc

This commit is contained in:
Kohaku-Blueleaf
2025-10-05 18:15:11 +08:00
parent d340743897
commit 6cc6abbf2d
8 changed files with 335 additions and 35 deletions

View File

@@ -38,3 +38,5 @@ debug_log_payloads = true
# LFS Garbage Collection settings
lfs_keep_versions = 5 # Keep last K versions of each LFS file
lfs_auto_gc = true # Automatically delete old LFS objects on commit
# Site identification
site_name = "KohakuHub" # Customizable site name (e.g., "MyCompany Hub")

View File

@@ -42,6 +42,10 @@ services:
- KOHAKU_HUB_BASE_URL=http://127.0.0.1:28080 # Web UI will proxy requests to hub-api, we use hub-ui url here
- KOHAKU_HUB_DB_BACKEND=postgres
- KOHAKU_HUB_DATABASE_URL=postgresql://hub:hubpass@postgres:5432/hubdb # Linking to the 'postgres' service
- KOHAKU_HUB_LFS_THRESHOLD_BYTES=1000000
- KOHAKU_HUB_LFS_KEEP_VERSIONS=5
- KOHAKU_HUB_LFS_AUTO_GC=true
- KOHAKU_HUB_SITE_NAME=KohakuHub
## SMTP Configuration (Email Verification - Optional)
- KOHAKU_HUB_SMTP_ENABLED=false
- KOHAKU_HUB_SMTP_HOST=smtp.gmail.com

View File

@@ -294,10 +294,22 @@
</p>
</div>
<template v-else>
<!-- Header Row (desktop only) -->
<div
class="hidden md:grid md:grid-cols-[auto_1fr_120px_150px] gap-3 py-2 px-2 text-sm font-medium text-gray-600 dark:text-gray-400 border-b"
>
<div></div>
<!-- Icon column -->
<div>Name</div>
<div class="text-right">Size</div>
<div class="text-right">Last Modified</div>
</div>
<!-- File Rows -->
<div
v-for="file in filteredFiles"
:key="file.path"
class="py-3 flex items-center gap-3 hover:bg-gray-50 dark:hover:bg-gray-700 px-2 cursor-pointer transition-colors"
class="py-3 grid grid-cols-[auto_1fr_auto] md:grid-cols-[auto_1fr_120px_150px] gap-3 items-center hover:bg-gray-50 dark:hover:bg-gray-700 px-2 cursor-pointer transition-colors"
@click="handleFileClick(file)"
>
<div
@@ -308,14 +320,21 @@
"
class="text-xl flex-shrink-0"
/>
<div class="flex-1 min-w-0">
<div class="min-w-0">
<div class="font-medium truncate">
{{ getFileName(file.path) }}
</div>
</div>
<div class="text-sm text-gray-500 dark:text-gray-400">
<div
class="text-sm text-gray-500 dark:text-gray-400 text-right"
>
{{ formatSize(file.size) }}
</div>
<div
class="hidden md:block text-sm text-gray-500 dark:text-gray-400 text-right"
>
{{ formatLastModified(file.lastModified) }}
</div>
</div>
<div
@@ -433,6 +452,10 @@
<span class="text-gray-600 dark:text-gray-400">Created:</span>
<span class="ml-2">{{ formatDate(repoInfo?.createdAt) }}</span>
</div>
<div v-if="repoInfo?.lastModified">
<span class="text-gray-600 dark:text-gray-400">Updated:</span>
<span class="ml-2">{{ formatDate(repoInfo?.lastModified) }}</span>
</div>
<div v-if="repoInfo?.sha">
<span class="text-gray-600 dark:text-gray-400">Commit:</span>
<span class="ml-2 font-mono text-xs">{{
@@ -551,6 +574,7 @@ const pathSegments = computed(() => {
});
const filteredFiles = computed(() => {
// Backend now provides folder stats, so just filter
if (!fileSearchQuery.value) return fileTree.value;
const query = fileSearchQuery.value.toLowerCase();
@@ -582,6 +606,15 @@ function formatSize(bytes) {
return (bytes / (1024 * 1024 * 1024)).toFixed(1) + " GB";
}
function formatLastModified(dateString) {
  // Render a timestamp as a relative time ("3 hours ago"), or "-" when the
  // value is absent or unparseable.
  // NOTE: dayjs does not throw on bad input — it returns an invalid instance
  // whose fromNow() yields "Invalid Date" — so a try/catch can never fire
  // here. Check isValid() explicitly instead.
  if (!dateString) return "-";
  const parsed = dayjs(dateString);
  return parsed.isValid() ? parsed.fromNow() : "-";
}
function getFileName(path) {
const parts = path.split("/");
return parts[parts.length - 1] || path;

View File

@@ -468,17 +468,53 @@ async def list_repo_tree(
# Remove prefix from path to get relative path
relative_path = obj.path[prefix_len:] if prefix else obj.path
# Calculate folder stats by listing its contents recursively
folder_size = 0
folder_latest_mtime = None
try:
# List all objects in this folder recursively
folder_contents = await async_client.list_objects(
repository=lakefs_repo,
ref=revision,
prefix=obj.path, # Use full path as prefix
delimiter="", # No delimiter = recursive
amount=1000,
)
# Calculate total size and find latest modification
for child_obj in folder_contents.results:
if child_obj.path_type == "object":
folder_size += child_obj.size_bytes or 0
if hasattr(child_obj, "mtime") and child_obj.mtime:
if (
folder_latest_mtime is None
or child_obj.mtime > folder_latest_mtime
):
folder_latest_mtime = child_obj.mtime
except Exception as e:
logger.debug(
f"Could not calculate stats for folder {obj.path}: {str(e)}"
)
dir_obj = {
"type": "directory",
"oid": (
obj.checksum if hasattr(obj, "checksum") and obj.checksum else ""
),
"size": 0,
"size": folder_size,
"path": relative_path.rstrip("/"), # Remove trailing slash
}
# Add last modified info if available
if hasattr(obj, "mtime") and obj.mtime:
# Add last modified info
if folder_latest_mtime:
from datetime import datetime
dir_obj["lastModified"] = datetime.fromtimestamp(
folder_latest_mtime
).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
elif hasattr(obj, "mtime") and obj.mtime:
from datetime import datetime
dir_obj["lastModified"] = datetime.fromtimestamp(obj.mtime).strftime(

View File

@@ -509,6 +509,7 @@ async def commit(
# Process operations
files_changed = False # Track if any files actually changed
pending_lfs_tracking = [] # Track LFS objects to record in history after commit
for op in operations:
key = op["key"]
@@ -698,35 +699,15 @@ async def commit(
logger.success(f"Updated database record for LFS file: {path}")
# NOW delete the old LFS object if it exists and is not used elsewhere
if old_lfs_oid:
# Check if this OID is still used by other files (deduplication check)
other_uses = (
File.select()
.where((File.sha256 == old_lfs_oid) & (File.lfs == True))
.count()
)
if other_uses == 0:
# Safe to delete - not used anywhere
old_lfs_key = (
f"lfs/{old_lfs_oid[:2]}/{old_lfs_oid[2:4]}/{old_lfs_oid}"
)
try:
from .s3_utils import get_s3_client
s3_client = get_s3_client()
s3_client.delete_object(Bucket=cfg.s3.bucket, Key=old_lfs_key)
logger.success(f"Deleted old LFS object: {old_lfs_key}")
except Exception as e:
# Log but don't fail - the new file is already linked successfully
logger.warning(
f"Failed to delete old LFS object {old_lfs_key}: {e}"
)
else:
logger.success(
f"Keeping old LFS object {old_lfs_oid} - still used by {other_uses} file(s)"
)
# Track this LFS object for GC after commit
pending_lfs_tracking.append(
{
"path": path,
"sha256": oid,
"size": size,
"old_sha256": old_lfs_oid,
}
)
elif key == "deletedFile":
# Delete a single file
@@ -967,6 +948,32 @@ async def commit(
commit_url = f"{cfg.app.base_url}/{repo_id}/commit/{commit_result.id}"
logger.success(f"Commit URL: {commit_url}")
# Now that we have commit_id, track LFS objects and run GC
if pending_lfs_tracking:
from .gc_utils import track_lfs_object, run_gc_for_file
for lfs_info in pending_lfs_tracking:
# Track the new LFS object in history
track_lfs_object(
repo_full_id=repo_id,
path_in_repo=lfs_info["path"],
sha256=lfs_info["sha256"],
size=lfs_info["size"],
commit_id=commit_result.id,
)
# Run GC for this file if enabled
if cfg.app.lfs_auto_gc and lfs_info.get("old_sha256"):
deleted_count = run_gc_for_file(
repo_full_id=repo_id,
path_in_repo=lfs_info["path"],
current_commit_id=commit_result.id,
)
if deleted_count > 0:
logger.info(
f"GC: Cleaned up {deleted_count} old version(s) of {lfs_info['path']}"
)
return {
"commitUrl": commit_url,
"commitOid": commit_result.id,

View File

@@ -0,0 +1,201 @@
"""Garbage collection utilities for LFS objects."""
from typing import List, Optional
from datetime import datetime, timezone
from ..db import LFSObjectHistory, File
from ..config import cfg
from ..logger import get_logger
logger = get_logger("GC")
def track_lfs_object(
    repo_full_id: str,
    path_in_repo: str,
    sha256: str,
    size: int,
    commit_id: str,
):
    """Record that *sha256* backs *path_in_repo* as of *commit_id*.

    One history row is appended per (file, commit), which later lets the
    garbage collector decide which object versions fall outside the
    retention window.

    Args:
        repo_full_id: Full repository ID (namespace/name)
        path_in_repo: File path in repository
        sha256: LFS object SHA256 hash
        size: Object size in bytes
        commit_id: LakeFS commit ID
    """
    row = dict(
        repo_full_id=repo_full_id,
        path_in_repo=path_in_repo,
        sha256=sha256,
        size=size,
        commit_id=commit_id,
    )
    LFSObjectHistory.create(**row)
    logger.debug(
        f"Tracked LFS object {sha256[:8]} for {path_in_repo} in commit {commit_id[:8]}"
    )
def get_old_lfs_versions(
    repo_full_id: str,
    path_in_repo: str,
    keep_count: int,
) -> List[str]:
    """Get old LFS object hashes that should be garbage collected.

    Keeps the newest ``keep_count`` history entries for the file and marks
    the SHA256 of every older entry for deletion, excluding any hash that
    also appears among the kept entries (the same content may have been
    re-committed).

    Args:
        repo_full_id: Full repository ID
        path_in_repo: File path
        keep_count: Number of versions to keep

    Returns:
        List of unique SHA256 hashes to delete, newest first
    """
    # All historical versions for this file, newest first.
    history = (
        LFSObjectHistory.select()
        .where(
            (LFSObjectHistory.repo_full_id == repo_full_id)
            & (LFSObjectHistory.path_in_repo == path_in_repo)
        )
        .order_by(LFSObjectHistory.created_at.desc())
    )
    all_versions = list(history)

    if len(all_versions) <= keep_count:
        # Not enough versions to trigger GC
        logger.debug(
            f"Only {len(all_versions)} versions of {path_in_repo}, keeping all"
        )
        return []

    # Keep the newest K versions, delete the rest
    versions_to_keep = all_versions[:keep_count]
    versions_to_delete = all_versions[keep_count:]

    keep_hashes = {v.sha256 for v in versions_to_keep}

    # Deduplicate while preserving order: the same sha256 can appear in
    # several history rows (identical content committed more than once), and
    # returning it twice would cause duplicate cleanup attempts and an
    # inflated deletion count downstream.
    seen = set()
    delete_hashes = []
    for old_version in versions_to_delete:
        h = old_version.sha256
        if h not in keep_hashes and h not in seen:
            seen.add(h)
            delete_hashes.append(h)

    logger.info(
        f"GC for {path_in_repo}: keeping {len(versions_to_keep)} versions, "
        f"marking {len(delete_hashes)} for deletion"
    )
    return delete_hashes
def cleanup_lfs_object(sha256: str, repo_full_id: Optional[str] = None) -> bool:
    """Delete an LFS object from S3 if it's not used anywhere.

    The S3 key space for LFS objects (``lfs/{aa}/{bb}/{sha256}``) is global
    and content-deduplicated, so the liveness check must span ALL
    repositories even when *repo_full_id* is given; *repo_full_id* only
    scopes which history rows are removed after a successful delete.

    Args:
        sha256: LFS object hash
        repo_full_id: Optional - restrict history cleanup to specific repo

    Returns:
        True if deleted, False if still in use or deletion failed
    """
    # Current files in ANY repo may share this object via deduplication —
    # never restrict this check to a single repo, or a shared object could
    # be deleted out from under another repository.
    current_uses = (
        File.select().where((File.sha256 == sha256) & (File.lfs == True)).count()
    )
    if current_uses > 0:
        logger.debug(
            f"LFS object {sha256[:8]} still used by {current_uses} file(s), keeping"
        )
        return False

    # History rows belonging to OTHER repos also pin the object; only the
    # target repo's own (old-version) rows are allowed to exist here, since
    # those are exactly what this cleanup removes below.
    history_query = LFSObjectHistory.select().where(
        LFSObjectHistory.sha256 == sha256
    )
    if repo_full_id:
        history_query = history_query.where(
            LFSObjectHistory.repo_full_id != repo_full_id
        )
    history_uses = history_query.count()
    if history_uses > 0:
        logger.debug(
            f"LFS object {sha256[:8]} in history ({history_uses} references), keeping"
        )
        return False

    # Safe to delete from S3
    try:
        from .s3_utils import get_s3_client

        lfs_key = f"lfs/{sha256[:2]}/{sha256[2:4]}/{sha256}"
        s3_client = get_s3_client()
        s3_client.delete_object(Bucket=cfg.s3.bucket, Key=lfs_key)
        logger.success(f"Deleted LFS object from S3: {lfs_key}")

        # Remove the now-dangling history rows (scoped to the repo when given)
        if repo_full_id:
            deleted_count = (
                LFSObjectHistory.delete()
                .where(
                    (LFSObjectHistory.repo_full_id == repo_full_id)
                    & (LFSObjectHistory.sha256 == sha256)
                )
                .execute()
            )
        else:
            deleted_count = (
                LFSObjectHistory.delete()
                .where(LFSObjectHistory.sha256 == sha256)
                .execute()
            )
        logger.info(f"Removed {deleted_count} history records for {sha256[:8]}")
        return True
    except Exception as e:
        # Best-effort: report the failure but leave the history rows intact so
        # a later GC pass can retry the delete.
        logger.exception(f"Failed to delete LFS object {sha256[:8]}", e)
        return False
def run_gc_for_file(
    repo_full_id: str,
    path_in_repo: str,
    current_commit_id: str,
) -> int:
    """Run garbage collection for a specific file.

    Looks up which object versions fall outside the configured retention
    window and attempts to remove each one; a version still referenced
    elsewhere is simply kept.

    Args:
        repo_full_id: Full repository ID
        path_in_repo: File path
        current_commit_id: Current commit ID

    Returns:
        Number of objects deleted
    """
    if not cfg.app.lfs_auto_gc:
        logger.debug("Auto GC disabled, skipping")
        return 0

    stale_hashes = get_old_lfs_versions(
        repo_full_id, path_in_repo, cfg.app.lfs_keep_versions
    )
    if not stale_hashes:
        return 0

    # Each cleanup may decline (object still in use); count actual removals.
    deleted_count = sum(
        1 for stale in stale_hashes if cleanup_lfs_object(stale, repo_full_id)
    )

    if deleted_count > 0:
        logger.success(
            f"GC completed for {path_in_repo}: deleted {deleted_count} old version(s)"
        )
    return deleted_count

View File

@@ -26,9 +26,12 @@ def get_version():
Returns:
Site identification and version information
"""
from ..config import cfg
return {
"site": "kohakuhub",
"version": "0.0.1",
"name": cfg.app.site_name,
}
@@ -86,6 +89,8 @@ def whoami_v2(user: User = Depends(get_optional_user)):
}
)
from ..config import cfg
return {
"type": "user",
"id": str(user.id),
@@ -100,4 +105,10 @@ def whoami_v2(user: User = Depends(get_optional_user)):
"type": "access_token",
"accessToken": {"displayName": "Auto-generated token", "role": "write"},
},
# KohakuHub-specific fields
"site": {
"name": cfg.app.site_name, # Configurable site name
"api": "kohakuhub", # Hardcoded API identifier
"version": "0.0.1", # Hardcoded version
},
}

View File

@@ -52,6 +52,8 @@ class AppConfig(BaseModel):
# LFS Garbage Collection settings
lfs_keep_versions: int = 5 # Keep last K versions of each file
lfs_auto_gc: bool = False # Auto-delete old LFS objects on commit
# Site identification
site_name: str = "KohakuHub" # Configurable site name (e.g., "MyCompany Hub")
class Config(BaseModel):
@@ -117,6 +119,10 @@ def load_config(path: str = None) -> Config:
lfs_threshold_bytes=int(
os.environ.get("KOHAKU_HUB_LFS_THRESHOLD_BYTES", "10000000")
),
lfs_keep_versions=int(os.environ.get("KOHAKU_HUB_LFS_KEEP_VERSIONS", "5")),
lfs_auto_gc=os.environ.get("KOHAKU_HUB_LFS_AUTO_GC", "false").lower()
== "true",
site_name=os.environ.get("KOHAKU_HUB_SITE_NAME", "KohakuHub"),
)
return Config(