+
{{ getFileName(file.path) }}
-
+
{{ formatSize(file.size) }}
+
+ {{ formatLastModified(file.lastModified) }}
+
Created:
{{ formatDate(repoInfo?.createdAt) }}
+
+ Updated:
+ {{ formatDate(repoInfo?.lastModified) }}
+
Commit:
{{
@@ -551,6 +574,7 @@ const pathSegments = computed(() => {
});
 const filteredFiles = computed(() => {
+  // Backend now provides folder stats, so just filter
   if (!fileSearchQuery.value) return fileTree.value;
   const query = fileSearchQuery.value.toLowerCase();
@@ -582,6 +606,15 @@ function formatSize(bytes) {
   return (bytes / (1024 * 1024 * 1024)).toFixed(1) + " GB";
}
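+
+// Relative "last modified" label, e.g. "2 days ago"; assumes the dayjs
+// relativeTime plugin is registered where dayjs is set up.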
+function formatLastModified(dateString) {
+  if (!dateString) return "-";
+  // dayjs never throws on bad input, so validate instead of try/catch
+  const d = dayjs(dateString);
+  return d.isValid() ? d.fromNow() : "-";
+}
+
 function getFileName(path) {
   const parts = path.split("/");
   return parts[parts.length - 1] || path;
diff --git a/src/kohakuhub/api/basic.py b/src/kohakuhub/api/basic.py
index 6e18fb0..79e2808 100644
--- a/src/kohakuhub/api/basic.py
+++ b/src/kohakuhub/api/basic.py
@@ -468,17 +468,53 @@ async def list_repo_tree(
         # Remove prefix from path to get relative path
         relative_path = obj.path[prefix_len:] if prefix else obj.path

+        # Calculate folder stats by listing its contents recursively
+        folder_size = 0
+        folder_latest_mtime = None
+
+        try:
+            # List objects in this folder recursively (amount=1000 caps the
+            # listing, so stats for very large folders may be partial)
+            folder_contents = await async_client.list_objects(
+                repository=lakefs_repo,
+                ref=revision,
+                prefix=obj.path,  # Use full path as prefix
+                delimiter="",  # No delimiter = recursive
+                amount=1000,
+            )
+
+            # Calculate total size and find latest modification
+            for child_obj in folder_contents.results:
+                if child_obj.path_type == "object":
+                    folder_size += child_obj.size_bytes or 0
+                    if hasattr(child_obj, "mtime") and child_obj.mtime:
+                        if (
+                            folder_latest_mtime is None
+                            or child_obj.mtime > folder_latest_mtime
+                        ):
+                            folder_latest_mtime = child_obj.mtime
+
+        except Exception as e:
+            logger.debug(f"Could not calculate stats for folder {obj.path}: {e}")
+
         dir_obj = {
             "type": "directory",
             "oid": (
                 obj.checksum if hasattr(obj, "checksum") and obj.checksum else ""
             ),
-            "size": 0,
+            "size": folder_size,
             "path": relative_path.rstrip("/"),  # Remove trailing slash
         }
-        # Add last modified info if available
-        if hasattr(obj, "mtime") and obj.mtime:
+        # Add last modified info
+        if folder_latest_mtime:
+            from datetime import datetime, timezone
+
+            # Use UTC here so the trailing "Z" in the format string is accurate
+            dir_obj["lastModified"] = datetime.fromtimestamp(
+                folder_latest_mtime, tz=timezone.utc
+            ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+        elif hasattr(obj, "mtime") and obj.mtime:
             from datetime import datetime

             dir_obj["lastModified"] = datetime.fromtimestamp(obj.mtime).strftime(
diff --git a/src/kohakuhub/api/file.py b/src/kohakuhub/api/file.py
index 0fdab0d..d6985bb 100644
--- a/src/kohakuhub/api/file.py
+++ b/src/kohakuhub/api/file.py
@@ -509,6 +509,7 @@ async def commit(
     # Process operations
     files_changed = False  # Track if any files actually changed
+    pending_lfs_tracking = []  # Track LFS objects to record in history after commit
     for op in operations:
         key = op["key"]
@@ -698,35 +699,15 @@ async def commit(
logger.success(f"Updated database record for LFS file: {path}")
- # NOW delete the old LFS object if it exists and is not used elsewhere
- if old_lfs_oid:
- # Check if this OID is still used by other files (deduplication check)
- other_uses = (
- File.select()
- .where((File.sha256 == old_lfs_oid) & (File.lfs == True))
- .count()
- )
-
- if other_uses == 0:
- # Safe to delete - not used anywhere
- old_lfs_key = (
- f"lfs/{old_lfs_oid[:2]}/{old_lfs_oid[2:4]}/{old_lfs_oid}"
- )
- try:
- from .s3_utils import get_s3_client
-
- s3_client = get_s3_client()
- s3_client.delete_object(Bucket=cfg.s3.bucket, Key=old_lfs_key)
- logger.success(f"Deleted old LFS object: {old_lfs_key}")
- except Exception as e:
- # Log but don't fail - the new file is already linked successfully
- logger.warning(
- f"Failed to delete old LFS object {old_lfs_key}: {e}"
- )
- else:
- logger.success(
- f"Keeping old LFS object {old_lfs_oid} - still used by {other_uses} file(s)"
- )
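+                # Deleting the old object inline would be premature: if the
+                # commit below fails, the previous version must survive, so
+                # cleanup is deferred to post-commit GC (see gc_utils)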
+                # Track this LFS object for GC after commit
+                pending_lfs_tracking.append(
+                    {
+                        "path": path,
+                        "sha256": oid,
+                        "size": size,
+                        "old_sha256": old_lfs_oid,
+                    }
+                )
         elif key == "deletedFile":
             # Delete a single file
@@ -967,6 +948,32 @@ async def commit(
commit_url = f"{cfg.app.base_url}/{repo_id}/commit/{commit_result.id}"
logger.success(f"Commit URL: {commit_url}")
+    # Now that we have the commit ID, track LFS objects and run GC
+    if pending_lfs_tracking:
+        from .gc_utils import track_lfs_object, run_gc_for_file
+
+        for lfs_info in pending_lfs_tracking:
+            # Track the new LFS object in history
+            track_lfs_object(
+                repo_full_id=repo_id,
+                path_in_repo=lfs_info["path"],
+                sha256=lfs_info["sha256"],
+                size=lfs_info["size"],
+                commit_id=commit_result.id,
+            )
+
+            # Run GC for this file if enabled
+            if cfg.app.lfs_auto_gc and lfs_info.get("old_sha256"):
+                deleted_count = run_gc_for_file(
+                    repo_full_id=repo_id,
+                    path_in_repo=lfs_info["path"],
+                    current_commit_id=commit_result.id,
+                )
+                if deleted_count > 0:
+                    logger.info(
+                        f"GC: Cleaned up {deleted_count} old version(s) of {lfs_info['path']}"
+                    )
+
     return {
         "commitUrl": commit_url,
         "commitOid": commit_result.id,
diff --git a/src/kohakuhub/api/gc_utils.py b/src/kohakuhub/api/gc_utils.py
new file mode 100644
index 0000000..face7b1
--- /dev/null
+++ b/src/kohakuhub/api/gc_utils.py
@@ -0,0 +1,201 @@
+"""Garbage collection utilities for LFS objects."""
+
+from typing import List, Optional
+from datetime import datetime, timezone
+
+from ..db import LFSObjectHistory, File
+from ..config import cfg
+from ..logger import get_logger
+
+logger = get_logger("GC")
+
+
+def track_lfs_object(
+    repo_full_id: str,
+    path_in_repo: str,
+    sha256: str,
+    size: int,
+    commit_id: str,
+):
+    """Track LFS object usage in a commit.
+
+    Args:
+        repo_full_id: Full repository ID (namespace/name)
+        path_in_repo: File path in repository
+        sha256: LFS object SHA256 hash
+        size: Object size in bytes
+        commit_id: LakeFS commit ID
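+
+    Example (illustrative values):
+        track_lfs_object("alice/my-model", "weights.bin", sha, 1024, commit.id)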
+ """
+ LFSObjectHistory.create(
+ repo_full_id=repo_full_id,
+ path_in_repo=path_in_repo,
+ sha256=sha256,
+ size=size,
+ commit_id=commit_id,
+ )
+ logger.debug(
+ f"Tracked LFS object {sha256[:8]} for {path_in_repo} in commit {commit_id[:8]}"
+ )
+
+
+def get_old_lfs_versions(
+    repo_full_id: str,
+    path_in_repo: str,
+    keep_count: int,
+) -> List[str]:
+    """Get old LFS object hashes that should be garbage collected.
+
+    Args:
+        repo_full_id: Full repository ID
+        path_in_repo: File path
+        keep_count: Number of versions to keep
+
+    Returns:
+        List of SHA256 hashes to delete
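+
+    Example:
+        With keep_count=5 and seven tracked versions of a file, the two
+        oldest hashes are returned for deletion.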
+ """
+ # Get all historical versions for this file, sorted by creation date (newest first)
+ history = (
+ LFSObjectHistory.select()
+ .where(
+ (LFSObjectHistory.repo_full_id == repo_full_id)
+ & (LFSObjectHistory.path_in_repo == path_in_repo)
+ )
+ .order_by(LFSObjectHistory.created_at.desc())
+ )
+
+ all_versions = list(history)
+
+ if len(all_versions) <= keep_count:
+ # Not enough versions to trigger GC
+ logger.debug(
+ f"Only {len(all_versions)} versions of {path_in_repo}, keeping all"
+ )
+ return []
+
+ # Keep the newest K versions, delete the rest
+ versions_to_keep = all_versions[:keep_count]
+ versions_to_delete = all_versions[keep_count:]
+
+ keep_hashes = {v.sha256 for v in versions_to_keep}
+ delete_hashes = []
+
+ for old_version in versions_to_delete:
+ # Only delete if not in the "keep" set (shouldn't happen, but safety check)
+ if old_version.sha256 not in keep_hashes:
+ delete_hashes.append(old_version.sha256)
+
+ logger.info(
+ f"GC for {path_in_repo}: keeping {len(versions_to_keep)} versions, "
+ f"marking {len(delete_hashes)} for deletion"
+ )
+
+ return delete_hashes
+
+
+def cleanup_lfs_object(sha256: str, repo_full_id: Optional[str] = None) -> bool:
+    """Delete an LFS object from S3 if it's not referenced anywhere.
+
+    Args:
+        sha256: LFS object hash
+        repo_full_id: Optional - scope history cleanup to this repo
+
+    Returns:
+        True if deleted, False if still in use or deletion failed
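+
+    Note:
+        LFS objects are content-addressed and stored once, globally, under
+        lfs/{sha256[:2]}/{sha256[2:4]}/{sha256}, so reference checks must
+        span all repositories before deleting.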
+ """
+ # Check if this object is still referenced in current files
+ query = File.select().where((File.sha256 == sha256) & (File.lfs == True))
+ if repo_full_id:
+ query = query.where(File.repo_full_id == repo_full_id)
+
+ current_uses = query.count()
+
+ if current_uses > 0:
+ logger.debug(
+ f"LFS object {sha256[:8]} still used by {current_uses} file(s), keeping"
+ )
+ return False
+
+ # Check if this object is referenced in any commit history (other repos might use it)
+ if not repo_full_id:
+ # Global check across all repos
+ history_uses = (
+ LFSObjectHistory.select().where(LFSObjectHistory.sha256 == sha256).count()
+ )
+
+ if history_uses > 0:
+ logger.debug(
+ f"LFS object {sha256[:8]} in history ({history_uses} references), keeping"
+ )
+ return False
+
+ # Safe to delete from S3
+ try:
+ from .s3_utils import get_s3_client
+
+ lfs_key = f"lfs/{sha256[:2]}/{sha256[2:4]}/{sha256}"
+ s3_client = get_s3_client()
+ s3_client.delete_object(Bucket=cfg.s3.bucket, Key=lfs_key)
+
+ logger.success(f"Deleted LFS object from S3: {lfs_key}")
+
+ # Remove from history table
+ if repo_full_id:
+ deleted_count = (
+ LFSObjectHistory.delete()
+ .where(
+ (LFSObjectHistory.repo_full_id == repo_full_id)
+ & (LFSObjectHistory.sha256 == sha256)
+ )
+ .execute()
+ )
+ else:
+ deleted_count = (
+ LFSObjectHistory.delete()
+ .where(LFSObjectHistory.sha256 == sha256)
+ .execute()
+ )
+
+ logger.info(f"Removed {deleted_count} history records for {sha256[:8]}")
+ return True
+
+ except Exception as e:
+ logger.exception(f"Failed to delete LFS object {sha256[:8]}", e)
+ return False
+
+
+def run_gc_for_file(
+    repo_full_id: str,
+    path_in_repo: str,
+    current_commit_id: str,
+) -> int:
+    """Run garbage collection for a specific file.
+
+    Args:
+        repo_full_id: Full repository ID
+        path_in_repo: File path
+        current_commit_id: Current commit ID
+
+    Returns:
+        Number of objects deleted
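+
+    Example (illustrative IDs):
+        run_gc_for_file("alice/my-model", "model.safetensors", commit.id)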
+ """
+ if not cfg.app.lfs_auto_gc:
+ logger.debug("Auto GC disabled, skipping")
+ return 0
+
+ keep_count = cfg.app.lfs_keep_versions
+ old_hashes = get_old_lfs_versions(repo_full_id, path_in_repo, keep_count)
+
+ if not old_hashes:
+ return 0
+
+ deleted_count = 0
+ for sha256 in old_hashes:
+ if cleanup_lfs_object(sha256, repo_full_id):
+ deleted_count += 1
+
+ if deleted_count > 0:
+ logger.success(
+ f"GC completed for {path_in_repo}: deleted {deleted_count} old version(s)"
+ )
+
+ return deleted_count
diff --git a/src/kohakuhub/api/utils.py b/src/kohakuhub/api/utils.py
index 13dc67d..8129a37 100644
--- a/src/kohakuhub/api/utils.py
+++ b/src/kohakuhub/api/utils.py
@@ -26,9 +26,12 @@ def get_version():
     Returns:
         Site identification and version information
     """
+    from ..config import cfg
+
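+    # e.g. {"site": "kohakuhub", "version": "0.0.1", "name": "KohakuHub"}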
     return {
         "site": "kohakuhub",
         "version": "0.0.1",
+        "name": cfg.app.site_name,
     }
@@ -86,6 +89,8 @@ def whoami_v2(user: User = Depends(get_optional_user)):
             }
         )
+    from ..config import cfg
+
     return {
         "type": "user",
         "id": str(user.id),
@@ -100,4 +105,10 @@ def whoami_v2(user: User = Depends(get_optional_user)):
"type": "access_token",
"accessToken": {"displayName": "Auto-generated token", "role": "write"},
},
+ # KohakuHub-specific fields
+ "site": {
+ "name": cfg.app.site_name, # Configurable site name
+ "api": "kohakuhub", # Hardcoded API identifier
+ "version": "0.0.1", # Hardcoded version
+ },
}
diff --git a/src/kohakuhub/config.py b/src/kohakuhub/config.py
index 4c50cfb..bd92128 100644
--- a/src/kohakuhub/config.py
+++ b/src/kohakuhub/config.py
@@ -52,6 +52,8 @@ class AppConfig(BaseModel):
     # LFS Garbage Collection settings
     lfs_keep_versions: int = 5  # Keep last K versions of each file
     lfs_auto_gc: bool = False  # Auto-delete old LFS objects on commit
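+    # Overridable via KOHAKU_HUB_LFS_KEEP_VERSIONS / KOHAKU_HUB_LFS_AUTO_GC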
+    # Site identification
+    site_name: str = "KohakuHub"  # Configurable site name (e.g., "MyCompany Hub")
class Config(BaseModel):
@@ -117,6 +119,10 @@ def load_config(path: str = None) -> Config:
         lfs_threshold_bytes=int(
             os.environ.get("KOHAKU_HUB_LFS_THRESHOLD_BYTES", "10000000")
         ),
+        lfs_keep_versions=int(os.environ.get("KOHAKU_HUB_LFS_KEEP_VERSIONS", "5")),
+        lfs_auto_gc=os.environ.get("KOHAKU_HUB_LFS_AUTO_GC", "false").lower()
+        == "true",
+        site_name=os.environ.get("KOHAKU_HUB_SITE_NAME", "KohakuHub"),
     )
     return Config(