Add HF-compatible tree expand metadata

narugo1992
2026-04-19 19:59:57 +08:00
parent 82ea9d9272
commit 548c9ae844
5 changed files with 702 additions and 356 deletions

View File

@@ -451,20 +451,21 @@
<template v-else>
<!-- Header Row (desktop only) -->
<div
class="hidden md:grid md:grid-cols-[auto_1fr_120px_150px] gap-3 py-2 px-2 text-sm font-medium text-gray-600 dark:text-gray-400 border-b"
class="hidden md:grid md:grid-cols-[auto_minmax(0,1.4fr)_minmax(0,2fr)_120px_110px] gap-3 py-2 px-2 text-sm font-medium text-gray-600 dark:text-gray-400 border-b"
>
<div></div>
<!-- Icon column -->
<div>Name</div>
<div>Last Commit</div>
<div class="text-right">Updated</div>
<div class="text-right">Size</div>
<div class="text-right">Last Modified</div>
</div>
<!-- File Rows -->
<div
v-for="file in filteredFiles"
:key="file.path"
class="py-3 grid grid-cols-[auto_1fr_auto] md:grid-cols-[auto_1fr_120px_150px] gap-3 items-center hover:bg-gray-50 dark:hover:bg-gray-700 px-2 cursor-pointer transition-colors"
class="py-3 grid grid-cols-[auto_1fr] md:grid-cols-[auto_minmax(0,1.4fr)_minmax(0,2fr)_120px_110px] gap-3 items-center hover:bg-gray-50 dark:hover:bg-gray-700 px-2 cursor-pointer transition-colors"
@click="handleFileClick(file)"
>
<div
@@ -479,16 +480,34 @@
<div class="font-medium truncate">
{{ getFileName(file.path) }}
</div>
<div
class="mt-1 text-sm text-gray-500 dark:text-gray-400 truncate md:hidden"
>
{{ getEntryCommitTitle(file) }}
</div>
<div
class="mt-1 text-xs text-gray-400 dark:text-gray-500 md:hidden"
>
{{ getEntryUpdatedAt(file) }}
<span v-if="formatEntrySize(file) !== '-'">
· {{ formatEntrySize(file) }}
</span>
</div>
</div>
<div
class="text-sm text-gray-500 dark:text-gray-400 text-right"
class="hidden md:block min-w-0 text-sm text-gray-500 dark:text-gray-400 truncate"
>
{{ formatSize(file.size) }}
{{ getEntryCommitTitle(file) }}
</div>
<div
class="hidden md:block text-sm text-gray-500 dark:text-gray-400 text-right"
>
{{ formatLastModified(file.lastModified) }}
{{ getEntryUpdatedAt(file) }}
</div>
<div
class="hidden md:block text-sm text-gray-500 dark:text-gray-400 text-right"
>
{{ formatEntrySize(file) }}
</div>
</div>
@@ -866,8 +885,10 @@ const isLiked = ref(false);
const likesCount = ref(0);
const likingInProgress = ref(false);
const deletingFolder = ref(false);
const fileTreeRequestId = ref(0);
const baseUrl = window.location.origin;
const PATHS_INFO_BATCH_SIZE = 1000;
// Computed
const activeTab = computed(() => props.tab);
@@ -903,7 +924,6 @@ const pathSegments = computed(() => {
});
const filteredFiles = computed(() => {
// Backend now provides folder stats, so just filter
if (!fileSearchQuery.value) return fileTree.value;
const query = fileSearchQuery.value.toLowerCase();
@@ -1006,8 +1026,17 @@ function formatSize(bytes) {
return (bytes / (1000 * 1000 * 1000)).toFixed(1) + " GB";
}
function formatLastModified(dateString) {
return formatRelativeTime(dateString, "-");
function formatEntrySize(file) {
if (file.type === "directory") return "-";
return formatSize(file.size);
}
function getEntryCommitTitle(file) {
return file.lastCommit?.title || "-";
}
function getEntryUpdatedAt(file) {
return formatRelativeTime(file.lastCommit?.date || file.lastModified, "-");
}
function getFileName(path) {
@@ -1015,6 +1044,22 @@ function getFileName(path) {
return parts[parts.length - 1] || path;
}
function sortFileEntries(entries) {
return [...entries].sort((a, b) => {
if (a.type === "directory" && b.type !== "directory") return -1;
if (a.type !== "directory" && b.type === "directory") return 1;
return a.path.localeCompare(b.path);
});
}
function chunkPaths(paths, size) {
const chunks = [];
for (let index = 0; index < paths.length; index += size) {
chunks.push(paths.slice(index, index + size));
}
return chunks;
}
function navigateToTab(tab) {
switch (tab) {
case "files":
@@ -1164,8 +1209,13 @@ async function toggleLike() {
async function loadFileTree() {
filesLoading.value = true;
const requestId = fileTreeRequestId.value + 1;
fileTreeRequestId.value = requestId;
let sortedEntries = [];
try {
const { data } = await repoAPI.listTree(
const data = await repoAPI.listTreeAll(
props.repoType,
props.namespace,
props.name,
@@ -1174,16 +1224,60 @@ async function loadFileTree() {
{ recursive: false },
);
fileTree.value = data.sort((a, b) => {
if (a.type === "directory" && b.type !== "directory") return -1;
if (a.type !== "directory" && b.type === "directory") return 1;
return a.path.localeCompare(b.path);
});
if (requestId !== fileTreeRequestId.value) return;
sortedEntries = sortFileEntries(data || []);
fileTree.value = sortedEntries;
if (sortedEntries.length === 0) {
return;
}
} catch (err) {
console.error("Failed to load file tree:", err);
fileTree.value = [];
if (requestId === fileTreeRequestId.value) {
fileTree.value = [];
}
} finally {
filesLoading.value = false;
if (requestId === fileTreeRequestId.value) {
filesLoading.value = false;
}
}
if (sortedEntries.length === 0 || requestId !== fileTreeRequestId.value) {
return;
}
try {
const pathInfoByPath = new Map();
const pathBatches = chunkPaths(
sortedEntries.map((file) => file.path),
PATHS_INFO_BATCH_SIZE,
);
for (const pathBatch of pathBatches) {
const { data: expandedEntries } = await repoAPI.getPathsInfo(
props.repoType,
props.namespace,
props.name,
currentBranch.value,
pathBatch,
true,
);
if (requestId !== fileTreeRequestId.value) return;
for (const entry of expandedEntries || []) {
pathInfoByPath.set(entry.path, entry);
}
}
fileTree.value = sortedEntries.map((file) => ({
...file,
...(pathInfoByPath.get(file.path) || {}),
}));
} catch (err) {
console.error("Failed to load expanded path info:", err);
}
}
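For context, after the expanded metadata is merged in, each row the template renders is a tree entry overlaid with the paths-info payload. A minimal sketch of one such entry, with made-up values, assuming the field names produced by the backend in this commit:

// Illustrative only: one merged entry as the helpers above would read it.
// All values are invented; field names follow the HF-style wire format
// (type/oid/size/path/lastModified plus lastCommit and lfs when expanded).
const exampleEntry = {
  type: "file",
  path: "weights/model.safetensors",
  oid: "a1b2c3d4e5f6", // sha256 for LFS files, blob checksum otherwise
  size: 1474560,
  lastModified: "2026-04-19T11:59:57Z",
  lfs: { oid: "a1b2c3d4e5f6", size: 1474560, pointerSize: 134 },
  lastCommit: {
    id: "548c9ae844",
    title: "Add HF-compatible tree expand metadata",
    date: "2026-04-19T11:59:57Z",
  },
};
// getEntryCommitTitle(exampleEntry) -> "Add HF-compatible tree expand metadata"
// getEntryUpdatedAt(exampleEntry)   -> relative time rendered from lastCommit.date
// formatEntrySize(exampleEntry)     -> human-readable size via formatSize()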

View File

@@ -39,6 +39,12 @@ api.interceptors.response.use(
export default api;
function getNextLinkFromHeader(linkHeader) {
if (!linkHeader) return null;
const match = linkHeader.match(/<([^>]+)>;\s*rel="next"/i);
return match ? match[1] : null;
}
/**
* Auth API
*/
@@ -140,6 +146,65 @@ export const repoAPI = {
params,
}),
/**
* List all repository file tree entries by following Link pagination
* @param {string} type - Repository type
* @param {string} namespace - Owner namespace
* @param {string} name - Repository name
* @param {string} revision - Branch name or commit hash
* @param {string} path - Path within repository (with leading /)
* @param {Object} params - Query parameters { recursive?, expand? }
* @returns {Promise<Array>} - Array of files and directories across all pages
*/
listTreeAll: async (type, namespace, name, revision, path, params) => {
const entries = [];
let response = await repoAPI.listTree(
type,
namespace,
name,
revision,
path,
params,
);
entries.push(...(response.data || []));
let nextUrl = getNextLinkFromHeader(response.headers?.link);
while (nextUrl) {
response = await api.get(nextUrl);
entries.push(...(response.data || []));
nextUrl = getNextLinkFromHeader(response.headers?.link);
}
return entries;
},
/**
* Get repository metadata for specific paths
* @param {string} type - Repository type
* @param {string} namespace - Owner namespace
* @param {string} name - Repository name
* @param {string} revision - Branch name or commit hash
* @param {Array<string>} paths - Repository-relative paths
* @param {boolean} expand - Whether to request expanded metadata
* @returns {Promise} - Response whose data property is an array of files and directories
*/
getPathsInfo: (type, namespace, name, revision, paths, expand = false) => {
const formData = new URLSearchParams();
paths.forEach((path) => formData.append("paths", path));
formData.append("expand", expand ? "true" : "false");
return api.post(
`/api/${type}s/${namespace}/${name}/paths-info/${revision}`,
formData,
{
headers: {
"Content-Type": "application/x-www-form-urlencoded",
},
},
);
},
/**
* Upload files to repository
* @param {string} type - Repository type
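A minimal usage sketch for the two helpers added above; the repository coordinates, header value, and function name below are illustrative, and repoAPI refers to the object exported by this module.

// Example Link header that getNextLinkFromHeader() would match (value is made up):
//   Link: </api/models/acme/bert-tiny/tree/main?limit=1000&cursor=abc123>; rel="next"
// -> returns "/api/models/acme/bert-tiny/tree/main?limit=1000&cursor=abc123"
async function loadTreeWithMetadata() {
  // Gather every page of the listing, then request expanded metadata for the paths.
  const entries = await repoAPI.listTreeAll(
    "model", "acme", "bert-tiny", "main", "/", { recursive: false },
  );
  const { data: expanded } = await repoAPI.getPathsInfo(
    "model", "acme", "bert-tiny", "main",
    entries.map((entry) => entry.path).slice(0, 1000), // server caps one request at 1000 paths
    true,
  );
  return { entries, expanded };
}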

View File

@@ -189,12 +189,20 @@ def with_repo_fallback(operation: OperationType):
case "tree":
revision = kwargs.get("revision", "main")
path = kwargs.get("path", "")
recursive = kwargs.get("recursive", False)
expand = kwargs.get("expand", False)
limit = kwargs.get("limit")
cursor = kwargs.get("cursor")
result = await try_fallback_tree(
repo_type,
namespace,
name,
revision,
path,
recursive=recursive,
expand=expand,
limit=limit,
cursor=cursor,
user_tokens=user_tokens,
)
@@ -207,12 +215,14 @@ def with_repo_fallback(operation: OperationType):
# For paths-info, extract paths and revision from kwargs
revision = kwargs.get("revision", "main")
paths = kwargs.get("paths", [])
expand = kwargs.get("expand", False)
result = await try_fallback_paths_info(
repo_type,
namespace,
name,
revision,
paths,
expand=expand,
user_tokens=user_tokens,
)

View File

@@ -258,8 +258,12 @@ async def try_fallback_tree(
name: str,
revision: str,
path: str = "",
recursive: bool = False,
expand: bool = False,
limit: int | None = None,
cursor: str | None = None,
user_tokens: dict[str, str] | None = None,
) -> Optional[list]:
) -> Optional[Response]:
"""Try to get repository tree from fallback sources.
Args:
@@ -271,7 +275,7 @@ async def try_fallback_tree(
user_tokens: User-provided external tokens (overrides admin tokens)
Returns:
List of file/folder objects or None if not found
JSON response or None if not found
"""
sources = get_enabled_sources(namespace, user_tokens=user_tokens)
@@ -291,15 +295,30 @@ async def try_fallback_tree(
token=source.get("token"),
)
response = await client.get(kohaku_path, repo_type)
params = {
"recursive": recursive,
"expand": expand,
}
if limit is not None:
params["limit"] = limit
if cursor:
params["cursor"] = cursor
response = await client.get(kohaku_path, repo_type, params=params)
if response.status_code == 200:
data = response.json()
logger.info(
f"Fallback tree SUCCESS: {repo_type}/{namespace}/{name}/tree from {source['name']}"
)
return data
headers = {}
if response.headers.get("link"):
headers["Link"] = response.headers["link"]
return Response(
content=response.content,
status_code=response.status_code,
media_type=response.headers.get("content-type"),
headers=headers,
)
elif not should_retry_source(response):
return None
@@ -317,6 +336,7 @@ async def try_fallback_paths_info(
name: str,
revision: str,
paths: list[str],
expand: bool = False,
user_tokens: dict[str, str] | None = None,
) -> Optional[list]:
"""Try to get paths info from fallback sources.
@@ -351,7 +371,7 @@ async def try_fallback_paths_info(
# POST request with form data
response = await client.post(
kohaku_path, repo_type, data={"paths": paths, "expand": False}
kohaku_path, repo_type, data={"paths": paths, "expand": expand}
)
if response.status_code == 200:

View File

@@ -1,21 +1,30 @@
"""Repository tree listing and path information endpoints - Refactored version."""
"""Repository tree listing and path information endpoints."""
import asyncio
from datetime import datetime
from datetime import datetime, timezone
from typing import Literal
from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Form, Request
from fastapi import APIRouter, Depends, Form, Query, Request
from fastapi.responses import JSONResponse
from kohakuhub.config import cfg
from kohakuhub.constants import DATETIME_FORMAT_ISO
from kohakuhub.db import File, Repository, User
from kohakuhub.db_operations import get_file, get_repository, should_use_lfs
from kohakuhub.logger import get_logger
from kohakuhub.auth.dependencies import get_optional_user
from kohakuhub.auth.permissions import check_repo_read_permission
from kohakuhub.utils.lakefs import get_lakefs_client, lakefs_repo_name
from kohakuhub.constants import DATETIME_FORMAT_ISO
from kohakuhub.db import File, Repository, User
from kohakuhub.db_operations import get_repository, should_use_lfs
from kohakuhub.lakefs_rest_client import get_lakefs_rest_client
from kohakuhub.logger import get_logger
from kohakuhub.utils.lakefs import (
get_lakefs_client,
lakefs_repo_name,
resolve_revision,
)
from kohakuhub.api.fallback import with_repo_fallback
from kohakuhub.api.repo.utils.hf import (
hf_bad_request,
hf_entry_not_found,
hf_repo_not_found,
hf_revision_not_found,
hf_server_error,
@@ -28,196 +37,363 @@ router = APIRouter()
RepoType = Literal["model", "dataset", "space"]
async def fetch_lakefs_objects(
lakefs_repo: str, revision: str, prefix: str, recursive: bool
) -> list:
"""Fetch all objects from LakeFS with pagination.
Args:
lakefs_repo: LakeFS repository name
revision: Branch or commit
prefix: Path prefix
recursive: Whether to list recursively
Returns:
List of all LakeFS objects
Raises:
Exception: If listing fails
"""
client = get_lakefs_client()
all_results = []
after = ""
has_more = True
while has_more:
result = await client.list_objects(
repository=lakefs_repo,
ref=revision,
prefix=prefix,
delimiter="" if recursive else "/",
amount=1000, # Max per request
after=after,
)
all_results.extend(result["results"])
# Check pagination
if result.get("pagination") and result["pagination"].get("has_more"):
after = result["pagination"]["next_offset"]
has_more = True
else:
has_more = False
return all_results
TREE_PAGE_SIZE = 1000
TREE_EXPAND_PAGE_SIZE = 50
TREE_DIFF_PAGE_SIZE = 1000
TREE_COMMIT_SCAN_PAGE_SIZE = 100
PATHS_INFO_MAX_PATHS = 1000
PATHS_INFO_CONCURRENCY = 16
async def calculate_folder_stats(
lakefs_repo: str, revision: str, folder_path: str
) -> tuple[int, float | None]:
"""Calculate folder size and latest modification time.
Args:
lakefs_repo: LakeFS repository name
revision: Branch or commit
folder_path: Full folder path
Returns:
Tuple of (total_size, latest_mtime)
"""
folder_size = 0
folder_latest_mtime = None
try:
client = get_lakefs_client()
# Paginate through all objects in folder
after = ""
has_more = True
while has_more:
folder_contents = await client.list_objects(
repository=lakefs_repo,
ref=revision,
prefix=folder_path,
delimiter="", # No delimiter = recursive
amount=1000,
after=after,
)
# Calculate total size and find latest modification
for child_obj in folder_contents["results"]:
if child_obj["path_type"] == "object":
folder_size += child_obj.get("size_bytes") or 0
if child_obj.get("mtime"):
if (
folder_latest_mtime is None
or child_obj["mtime"] > folder_latest_mtime
):
folder_latest_mtime = child_obj["mtime"]
# Check pagination
pagination = folder_contents.get("pagination", {})
has_more = pagination.get("has_more", False)
if has_more:
after = pagination.get("next_offset", "")
except Exception as e:
logger.debug(f"Could not calculate stats for folder {folder_path}: {str(e)}")
return folder_size, folder_latest_mtime
def _normalize_repo_path(path: str) -> str:
"""Normalize a repository-relative path."""
return path.lstrip("/").rstrip("/")
async def convert_file_object(obj, repository: Repository) -> dict:
"""Convert LakeFS file object to HuggingFace format.
def _format_last_modified(mtime: float | int | None) -> str | None:
"""Format LakeFS mtime for HF-compatible responses."""
if not mtime:
return None
return datetime.fromtimestamp(mtime, tz=timezone.utc).strftime(DATETIME_FORMAT_ISO)
Args:
obj: LakeFS object dict
repository: Repository object (FK)
Returns:
HuggingFace formatted file object
"""
# Use repo-specific LFS settings
is_lfs = should_use_lfs(repository, obj["path"], obj["size_bytes"])
# Use full path relative to repository root (HuggingFace spec)
file_path = obj["path"]
# Get correct checksum from database using repository FK
file_record = get_file(repository, obj["path"])
checksum = (
file_record.sha256 if file_record and file_record.sha256 else obj["checksum"]
)
file_obj = {
"type": "file",
"oid": checksum, # Git blob SHA1 for non-LFS, SHA256 for LFS
"size": obj["size_bytes"],
"path": file_path,
}
# Add last modified info if available
if obj.get("mtime"):
file_obj["lastModified"] = datetime.fromtimestamp(obj["mtime"]).strftime(
def _format_commit_date(value) -> str | None:
"""Normalize commit dates to the HF datetime wire format."""
if value is None:
return None
if isinstance(value, (int, float)):
return datetime.fromtimestamp(value, tz=timezone.utc).strftime(
DATETIME_FORMAT_ISO
)
return value
# Add LFS metadata if it's an LFS file
if is_lfs:
file_obj["lfs"] = {
"oid": checksum, # SHA256 for LFS files
"size": obj["size_bytes"],
"pointerSize": 134, # Standard Git LFS pointer size
def _serialize_last_commit(commit: dict) -> dict:
"""Convert a LakeFS commit payload to the HF wire format."""
return {
"id": commit["id"],
"title": commit.get("message", ""),
"date": _format_commit_date(commit.get("creation_date")),
}
def _build_lfs_payload(checksum: str, size_bytes: int) -> dict:
"""Build the HF LFS metadata payload."""
return {
"oid": checksum,
"size": size_bytes,
"pointerSize": 134,
}
def _build_public_link(
request: Request,
*,
limit: int,
cursor: str,
) -> str:
"""Build a public-facing pagination link using the configured base URL."""
query_params = dict(request.query_params)
query_params["limit"] = str(limit)
query_params["cursor"] = cursor
return f"{cfg.app.base_url.rstrip('/')}{request.url.path}?{urlencode(query_params)}"
def _build_file_record_map(
repository: Repository, paths: list[str]
) -> dict[str, File]:
"""Fetch file records in one query for the provided paths."""
if not paths:
return {}
query = File.select().where(
(File.repository == repository)
& (File.path_in_repo.in_(paths))
& (File.is_deleted == False)
)
return {file.path_in_repo: file for file in query}
async def fetch_lakefs_objects_page(
lakefs_repo: str,
revision: str,
prefix: str,
recursive: bool,
amount: int,
after: str | None = None,
) -> dict:
"""Fetch one page of objects from LakeFS."""
client = get_lakefs_client()
return await client.list_objects(
repository=lakefs_repo,
ref=revision,
prefix=prefix,
delimiter="" if recursive else "/",
amount=amount,
after=after or "",
)
def _make_tree_item(
obj: dict,
repository: Repository,
file_records: dict[str, File],
expand: bool,
last_commit: dict | None = None,
) -> dict:
"""Convert a LakeFS object/common_prefix to the HF tree wire format."""
path = _normalize_repo_path(obj["path"])
item_type = obj["path_type"]
if item_type == "object":
size_bytes = obj.get("size_bytes") or 0
file_record = file_records.get(path)
checksum = (
file_record.sha256 if file_record and file_record.sha256 else obj.get("checksum", "")
)
is_lfs = (
file_record.lfs
if file_record is not None
else should_use_lfs(repository, path, size_bytes)
)
file_obj = {
"type": "file",
"oid": checksum,
"size": size_bytes,
"path": path,
}
return file_obj
last_modified = _format_last_modified(obj.get("mtime"))
if last_modified:
file_obj["lastModified"] = last_modified
if is_lfs:
file_obj["lfs"] = _build_lfs_payload(checksum, size_bytes)
async def convert_directory_object(
obj, lakefs_repo: str, revision: str
) -> dict:
"""Convert LakeFS directory object to HuggingFace format.
if expand:
file_obj["lastCommit"] = last_commit
file_obj["securityFileStatus"] = None
Args:
obj: LakeFS common_prefix object dict
lakefs_repo: LakeFS repository name
revision: Branch or commit
Returns:
HuggingFace formatted directory object
"""
# Use full path relative to repository root (HuggingFace spec)
dir_path = obj["path"]
# Calculate folder stats
folder_size, folder_latest_mtime = await calculate_folder_stats(
lakefs_repo, revision, obj["path"]
)
return file_obj
dir_obj = {
"type": "directory",
"oid": obj.get("checksum", ""),
"size": folder_size,
"path": dir_path.rstrip("/"), # Remove trailing slash
"size": 0,
"path": path,
}
# Add last modified info
if folder_latest_mtime:
dir_obj["lastModified"] = datetime.fromtimestamp(folder_latest_mtime).strftime(
DATETIME_FORMAT_ISO
)
elif obj.get("mtime"):
dir_obj["lastModified"] = datetime.fromtimestamp(obj["mtime"]).strftime(
DATETIME_FORMAT_ISO
)
last_modified = _format_last_modified(obj.get("mtime"))
if last_modified:
dir_obj["lastModified"] = last_modified
if expand:
dir_obj["lastCommit"] = last_commit
return dir_obj
def _apply_changed_path(
changed_path: str,
unresolved_files: set[str],
unresolved_directories: set[str],
resolved: dict[str, dict | None],
commit_info: dict,
) -> None:
"""Resolve file and ancestor directory targets touched by a diff path."""
normalized_path = _normalize_repo_path(changed_path)
if not normalized_path:
return
if normalized_path in unresolved_files:
unresolved_files.remove(normalized_path)
resolved[normalized_path] = commit_info
if normalized_path in unresolved_directories:
unresolved_directories.remove(normalized_path)
resolved[normalized_path] = commit_info
ancestor = normalized_path
while "/" in ancestor and unresolved_directories:
ancestor = ancestor.rsplit("/", 1)[0]
if ancestor in unresolved_directories:
unresolved_directories.remove(ancestor)
resolved[ancestor] = commit_info
async def resolve_last_commits_for_paths(
lakefs_repo: str,
revision: str,
targets: list[dict[str, str]],
) -> dict[str, dict | None]:
"""Resolve the latest commit touching each target path."""
unresolved_files = {
target["path"] for target in targets if target["type"] == "file" and target["path"]
}
unresolved_directories = {
target["path"]
for target in targets
if target["type"] == "directory" and target["path"]
}
if not unresolved_files and not unresolved_directories:
return {}
client = get_lakefs_rest_client()
resolved: dict[str, dict | None] = {}
commit_cursor: str | None = None
while unresolved_files or unresolved_directories:
log_result = await client.log_commits(
repository=lakefs_repo,
ref=revision,
after=commit_cursor,
amount=TREE_COMMIT_SCAN_PAGE_SIZE,
)
commits = log_result.get("results", [])
if not commits:
break
for commit in commits:
commit_info = _serialize_last_commit(commit)
parent_ids = commit.get("parents") or []
parent_id = parent_ids[0] if parent_ids else None
if not parent_id:
for path in unresolved_files:
resolved[path] = commit_info
for path in unresolved_directories:
resolved[path] = commit_info
unresolved_files.clear()
unresolved_directories.clear()
break
diff_cursor: str | None = None
while unresolved_files or unresolved_directories:
diff_result = await client.diff_refs(
repository=lakefs_repo,
left_ref=parent_id,
right_ref=commit["id"],
after=diff_cursor,
amount=TREE_DIFF_PAGE_SIZE,
)
for entry in diff_result.get("results", []):
diff_path = entry.get("path")
if diff_path:
_apply_changed_path(
diff_path,
unresolved_files,
unresolved_directories,
resolved,
commit_info,
)
if not unresolved_files and not unresolved_directories:
break
pagination = diff_result.get("pagination") or {}
if (
not pagination.get("has_more")
or (not unresolved_files and not unresolved_directories)
):
break
diff_cursor = pagination.get("next_offset")
if not unresolved_files and not unresolved_directories:
break
pagination = log_result.get("pagination") or {}
if not pagination.get("has_more") or (
not unresolved_files and not unresolved_directories
):
break
commit_cursor = pagination.get("next_offset")
return resolved
async def _process_single_path(
lakefs_repo: str,
revision: str,
repository: Repository,
clean_path: str,
file_records: dict[str, File],
semaphore: asyncio.Semaphore,
) -> dict | None:
"""Resolve one path to either a file or directory entry."""
client = get_lakefs_client()
async with semaphore:
try:
obj_stats = await client.stat_object(
repository=lakefs_repo,
ref=revision,
path=clean_path,
)
file_record = file_records.get(clean_path)
checksum = (
file_record.sha256
if file_record and file_record.sha256
else obj_stats.get("checksum", "")
)
size_bytes = obj_stats.get("size_bytes") or 0
is_lfs = (
file_record.lfs
if file_record is not None
else should_use_lfs(repository, clean_path, size_bytes)
)
file_info = {
"type": "file",
"path": clean_path,
"size": size_bytes,
"oid": checksum,
}
last_modified = _format_last_modified(obj_stats.get("mtime"))
if last_modified:
file_info["lastModified"] = last_modified
if is_lfs:
file_info["lfs"] = _build_lfs_payload(checksum, size_bytes)
return file_info
except Exception as error:
if not is_lakefs_not_found_error(error):
logger.debug(f"Failed to stat path {clean_path}: {error}")
return None
try:
list_result = await client.list_objects(
repository=lakefs_repo,
ref=revision,
prefix=f"{clean_path}/",
amount=1,
)
except Exception as error:
if not is_lakefs_not_found_error(error):
logger.debug(f"Failed to inspect directory path {clean_path}: {error}")
return None
if not list_result.get("results"):
return None
first_result = list_result["results"][0]
dir_info = {
"type": "directory",
"path": clean_path,
"oid": first_result.get("checksum", ""),
"size": 0,
}
last_modified = _format_last_modified(first_result.get("mtime"))
if last_modified:
dir_info["lastModified"] = last_modified
return dir_info
@router.get("/{repo_type}s/{namespace}/{repo_name}/tree/{revision}{path:path}")
@with_repo_fallback("tree")
async def list_repo_tree(
@@ -229,81 +405,108 @@ async def list_repo_tree(
path: str = "",
recursive: bool = False,
expand: bool = False,
limit: int | None = Query(default=None, ge=1),
cursor: str | None = None,
fallback: bool = True,
user: User | None = Depends(get_optional_user),
):
"""List repository file tree.
Returns a flat list of files and folders in HuggingFace format.
Args:
repo_type: Type of repository
namespace: Repository namespace
repo_name: Repository name
revision: Branch name or commit hash (default: "main")
path: Path within repository (default: root)
recursive: List recursively (default: False)
expand: Include detailed metadata (default: False)
user: Current authenticated user (optional)
Returns:
Flat list of file/folder objects
"""
# Construct full repo ID
"""List repository file tree."""
repo_id = f"{namespace}/{repo_name}"
# Check if repository exists using get_repository
repo_row = get_repository(repo_type, namespace, repo_name)
if not repo_row:
return hf_repo_not_found(repo_id, repo_type)
# Check read permission for private repos
check_repo_read_permission(repo_row, user)
lakefs_repo = lakefs_repo_name(repo_type, repo_id)
clean_path = _normalize_repo_path(path)
prefix = f"{clean_path}/" if clean_path else ""
default_limit = TREE_EXPAND_PAGE_SIZE if expand else TREE_PAGE_SIZE
page_size = min(limit or default_limit, default_limit)
# Clean path - ensure it ends with / if not empty
prefix = path.lstrip("/") if path and path != "/" else ""
if prefix and not prefix.endswith("/"):
prefix += "/"
# Fetch all objects from LakeFS
try:
all_results = await fetch_lakefs_objects(
lakefs_repo, revision, prefix, recursive
resolved_revision, _ = await resolve_revision(
get_lakefs_client(), lakefs_repo, revision
)
except Exception as e:
# Check for specific error types
if is_lakefs_not_found_error(e):
if is_lakefs_revision_error(e):
except Exception:
return hf_revision_not_found(repo_id, revision)
try:
page = await fetch_lakefs_objects_page(
lakefs_repo=lakefs_repo,
revision=resolved_revision,
prefix=prefix,
recursive=recursive,
amount=page_size,
after=cursor,
)
except Exception as error:
if is_lakefs_not_found_error(error):
if is_lakefs_revision_error(error):
return hf_revision_not_found(repo_id, revision)
else:
# Return empty list for non-existent paths
return []
return hf_entry_not_found(repo_id, clean_path or "/", revision)
# Other errors are server errors
logger.exception(f"Failed to list objects for {repo_id}", e)
return hf_server_error(f"Failed to list objects: {str(e)}")
logger.exception(f"Failed to list objects for {repo_id}", error)
return hf_server_error(f"Failed to list objects: {str(error)}")
# Convert LakeFS objects to HuggingFace format
result_list = []
page_results = page.get("results", [])
if clean_path and not page_results:
return hf_entry_not_found(repo_id, clean_path, revision)
for obj in all_results:
match obj["path_type"]:
case "object":
# File object - pass Repository FK instead of repo_id
file_obj = await convert_file_object(obj, repo_row)
result_list.append(file_obj)
file_paths = [
_normalize_repo_path(obj["path"])
for obj in page_results
if obj.get("path_type") == "object"
]
file_records = _build_file_record_map(repo_row, file_paths)
case "common_prefix":
# Directory object
dir_obj = await convert_directory_object(
obj, lakefs_repo, revision
)
result_list.append(dir_obj)
last_commit_map: dict[str, dict | None] = {}
if expand and page_results:
targets = [
{
"path": _normalize_repo_path(obj["path"]),
"type": "file"
if obj.get("path_type") == "object"
else "directory",
}
for obj in page_results
]
try:
last_commit_map = await resolve_last_commits_for_paths(
lakefs_repo=lakefs_repo,
revision=resolved_revision,
targets=targets,
)
except Exception as error:
if is_lakefs_not_found_error(error) and is_lakefs_revision_error(error):
return hf_revision_not_found(repo_id, revision)
logger.warning(
f"Failed to expand last commit data for {repo_id}/{revision}: {error}"
)
return result_list
result_list = [
_make_tree_item(
obj=obj,
repository=repo_row,
file_records=file_records,
expand=expand,
last_commit=last_commit_map.get(_normalize_repo_path(obj["path"])),
)
for obj in page_results
]
headers = {}
pagination = page.get("pagination") or {}
if pagination.get("has_more") and pagination.get("next_offset"):
next_url = _build_public_link(
request=request,
limit=page_size,
cursor=pagination["next_offset"],
)
headers["Link"] = f'<{next_url}>; rel="next"'
return JSONResponse(content=result_list, headers=headers)
@router.post("/{repo_type}s/{namespace}/{repo_name}/paths-info/{revision}")
@@ -319,123 +522,77 @@ async def get_paths_info(
fallback: bool = True,
user: User | None = Depends(get_optional_user),
):
"""Get information about specific paths in a repository.
This endpoint matches HuggingFace Hub API format.
Args:
repo_type: Type of repository (model/dataset/space)
namespace: Repository namespace
repo_name: Repository name
revision: Branch name or commit hash
paths: List of paths to get information about
expand: Whether to fetch extended metadata
user: Current authenticated user (optional)
Returns:
List of path information objects (files and folders)
"""
# Construct full repo ID
"""Get information about specific paths in a repository."""
repo_id = f"{namespace}/{repo_name}"
# Check if repository exists using get_repository
repo_row = get_repository(repo_type, namespace, repo_name)
if not repo_row:
return hf_repo_not_found(repo_id, repo_type)
# Check read permission for private repos
check_repo_read_permission(repo_row, user)
normalized_paths = [_normalize_repo_path(path) for path in paths if path]
if len(normalized_paths) > PATHS_INFO_MAX_PATHS:
return hf_bad_request(
f"Too many paths requested. Maximum supported paths per request is {PATHS_INFO_MAX_PATHS}."
)
lakefs_repo = lakefs_repo_name(repo_type, repo_id)
try:
resolved_revision, _ = await resolve_revision(
get_lakefs_client(), lakefs_repo, revision
)
except Exception:
return hf_revision_not_found(repo_id, revision)
# Helper function to process a single path
async def process_path(path: str) -> dict | None:
"""Process single path and return its info, or None if path doesn't exist."""
clean_path = path.lstrip("/")
file_records = _build_file_record_map(repo_row, normalized_paths)
semaphore = asyncio.Semaphore(PATHS_INFO_CONCURRENCY)
try:
results = await asyncio.gather(
*[
_process_single_path(
lakefs_repo=lakefs_repo,
revision=resolved_revision,
repository=repo_row,
clean_path=clean_path,
file_records=file_records,
semaphore=semaphore,
)
for clean_path in normalized_paths
]
)
except Exception as error:
if is_lakefs_not_found_error(error):
if is_lakefs_revision_error(error):
return hf_revision_not_found(repo_id, revision)
return []
logger.exception(f"Failed to fetch paths info for {repo_id}", error)
return hf_server_error(f"Failed to fetch paths info: {str(error)}")
existing_entries = [entry for entry in results if entry is not None]
if expand and existing_entries:
targets = [
{"path": entry["path"], "type": entry["type"]} for entry in existing_entries
]
try:
# Try to get object stats
client = get_lakefs_client()
obj_stats = await client.stat_object(
repository=lakefs_repo,
ref=revision,
path=clean_path,
last_commit_map = await resolve_last_commits_for_paths(
lakefs_repo=lakefs_repo,
revision=resolved_revision,
targets=targets,
)
# It's a file - use repo-specific LFS settings
is_lfs = should_use_lfs(repo_row, clean_path, obj_stats["size_bytes"])
# Get correct checksum from database using repository FK
file_record = get_file(repo_row, clean_path)
checksum = (
file_record.sha256
if file_record and file_record.sha256
else obj_stats["checksum"]
except Exception as error:
if is_lakefs_not_found_error(error) and is_lakefs_revision_error(error):
return hf_revision_not_found(repo_id, revision)
logger.warning(
f"Failed to expand paths-info commit data for {repo_id}/{revision}: {error}"
)
last_commit_map = {}
file_info = {
"type": "file",
"path": clean_path,
"size": obj_stats["size_bytes"],
"oid": checksum, # Git blob SHA1 for non-LFS, SHA256 for LFS
"lfs": None,
"last_commit": None,
"security": None,
}
for entry in existing_entries:
entry["lastCommit"] = last_commit_map.get(entry["path"])
if entry["type"] == "file":
entry["securityFileStatus"] = None
# Add LFS metadata if applicable
if is_lfs:
file_info["lfs"] = {
"oid": checksum, # SHA256 for LFS files
"size": obj_stats["size_bytes"],
"pointerSize": 134,
}
return file_info
except Exception as e:
# Check if it might be a directory by trying to list with prefix
if is_lakefs_not_found_error(e):
try:
# Try to list objects with this path as prefix
client = get_lakefs_client()
prefix = (
clean_path if clean_path.endswith("/") else clean_path + "/"
)
list_result = await client.list_objects(
repository=lakefs_repo,
ref=revision,
prefix=prefix,
amount=1, # Just check if any objects exist
)
# If we get results, it's a directory
if list_result["results"]:
# Try to get an oid from the first result if available
oid = ""
if list_result["results"]:
oid = list_result["results"][0].get("checksum", "")
return {
"type": "directory",
"path": clean_path,
"oid": oid,
"tree_id": oid,
"last_commit": None,
}
# Path doesn't exist, return None
return None
except Exception as ex:
# Path doesn't exist, skip it (as per HF behavior)
logger.debug(f"Path {clean_path} doesn't exist or is invalid")
return None
# For other errors, also return None
return None
# Process all paths in parallel
results = await asyncio.gather(*[process_path(path) for path in paths])
# Filter out None values (paths that don't exist)
return [r for r in results if r is not None]
return existing_entries
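Taken together, the endpoints above expose roughly the following HTTP contract: the tree listing accepts expand/limit/cursor query parameters and advertises further pages through a Link: <...>; rel="next" header, while paths-info takes a form-encoded body with repeated paths fields plus an expand flag. When expand is requested, lastCommit is resolved by walking the commit log and diffing each commit against its first parent, so a change to configs/train/base.yaml also resolves the ancestor directories configs/train and configs. A hedged client-side sketch follows; hostnames, repository coordinates, and cursor values are invented.

// Illustrative raw calls against the endpoints defined above.
async function fetchExpandedTreePage(cursor) {
  const query = new URLSearchParams({ expand: "true", limit: "50" });
  if (cursor) query.set("cursor", cursor);
  const res = await fetch(`/api/models/acme/bert-tiny/tree/main?${query}`);
  const entries = await res.json(); // [{ type, oid, size, path, lastModified?, lastCommit?, lfs? }, ...]
  // With more pages pending, the server answers with, for example:
  //   Link: <https://hub.example.org/api/models/acme/bert-tiny/tree/main?expand=true&limit=50&cursor=xyz>; rel="next"
  return { entries, nextLink: res.headers.get("link") };
}

async function fetchPathsInfo(paths) {
  const body = new URLSearchParams();
  paths.forEach((p) => body.append("paths", p));
  body.append("expand", "true");
  const res = await fetch("/api/models/acme/bert-tiny/paths-info/main", {
    method: "POST",
    headers: { "Content-Type": "application/x-www-form-urlencoded" },
    body,
  });
  return res.json(); // entries include lastCommit and, for files, securityFileStatus
}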