update ETag/OID handling and download filename behavior

Kohaku-Blueleaf
2025-10-04 19:19:53 +08:00
parent 7c83a118af
commit 5c00f03808
4 changed files with 96 additions and 26 deletions

View File

@@ -398,9 +398,21 @@ async def list_repo_tree(
                # Remove prefix from path to get relative path
                relative_path = obj.path[prefix_len:] if prefix else obj.path

+               # Get correct checksum from database
+               # sha256 column stores: git blob SHA1 for non-LFS, SHA256 for LFS
+               file_record = File.get_or_none(
+                   (File.repo_full_id == repo_id) & (File.path_in_repo == obj.path)
+               )
+               checksum = (
+                   file_record.sha256
+                   if file_record and file_record.sha256
+                   else obj.checksum
+               )
+
                file_obj = {
                    "type": "file",
-                   "oid": obj.checksum,
+                   "oid": checksum,  # Git blob SHA1 for non-LFS, SHA256 for LFS
                    "size": obj.size_bytes,
                    "path": relative_path,
                }
@@ -408,7 +420,7 @@ async def list_repo_tree(
                # Add LFS metadata if it's an LFS file
                if is_lfs:
                    file_obj["lfs"] = {
-                       "oid": obj.checksum,  # Use checksum as LFS oid
+                       "oid": checksum,  # SHA256 for LFS files
                        "size": obj.size_bytes,
                        "pointerSize": 134,  # Standard Git LFS pointer size
                    }
@@ -505,11 +517,23 @@ async def get_paths_info(
            # It's a file
            is_lfs = obj_stats.size_bytes > cfg.app.lfs_threshold_bytes

+           # Get correct checksum from database
+           # sha256 column stores: git blob SHA1 for non-LFS, SHA256 for LFS
+           file_record = File.get_or_none(
+               (File.repo_full_id == repo_id) & (File.path_in_repo == clean_path)
+           )
+           checksum = (
+               file_record.sha256
+               if file_record and file_record.sha256
+               else obj_stats.checksum
+           )
+
            file_info = {
                "type": "file",
                "path": clean_path,
                "size": obj_stats.size_bytes,
-               "blob_id": obj_stats.checksum,
+               "blob_id": checksum,  # Git blob SHA1 for non-LFS, SHA256 for LFS
                "lfs": None,
                "last_commit": None,
                "security": None,
@@ -518,7 +542,7 @@ async def get_paths_info(
            # Add LFS metadata if applicable
            if is_lfs:
                file_info["lfs"] = {
-                   "oid": obj_stats.checksum,
+                   "oid": checksum,  # SHA256 for LFS files
                    "size": obj_stats.size_bytes,
                    "pointerSize": 134,
                }
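
For context: the listing now consults the database because, for the same bytes, the non-LFS oid (git blob SHA1) and the LFS oid (plain SHA256) are different digests, so neither can be derived from the LakeFS checksum alone. A standalone sketch (not part of the diff):

    import hashlib

    content = b"hello world\n"
    git_blob_sha1 = hashlib.sha1(b"blob %d\x00" % len(content) + content).hexdigest()
    plain_sha256 = hashlib.sha256(content).hexdigest()
    print(git_blob_sha1)  # 3b18e512dba79e4c8300dd08aeb37f8e728b8dad
    print(plain_sha256)   # a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447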

View File

@@ -11,6 +11,25 @@ from typing import Any, Dict, List, Optional
from fastapi import APIRouter, Depends, HTTPException, Request, Header
from lakefs_client.models import CommitCreation, StagingLocation, StagingMetadata

+def calculate_git_blob_sha1(content: bytes) -> str:
+    """Calculate SHA1 hash in git blob format.
+
+    Git uses: sha1(f'blob {size}\\0' + content)
+
+    Args:
+        content: File content bytes
+
+    Returns:
+        SHA1 hex digest
+    """
+    size = len(content)
+    sha = hashlib.sha1()
+    sha.update(f"blob {size}\0".encode("utf-8"))
+    sha.update(content)
+    return sha.hexdigest()

from ..config import cfg
from ..db import File, Repository, StagingUpload, User
from .auth import get_current_user, get_optional_user
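
The helper can be sanity-checked against git itself, since `git hash-object` computes exactly this digest (a quick check, assuming the git CLI is installed):

    import subprocess

    content = b"hello world\n"
    expected = subprocess.run(
        ["git", "hash-object", "--stdin"],
        input=content,
        capture_output=True,
        check=True,
    ).stdout.decode().strip()
    assert calculate_git_blob_sha1(content) == expected  # 3b18e512dba79e4c8300dd08aeb37f8e728b8dad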
@@ -354,20 +373,23 @@ async def resolve_file(
        # Prepare headers required by HuggingFace client
        file_size = obj_stat.size_bytes
-       file_checksum = obj_stat.checksum  # This is the ETag from LakeFS
-       # Normalize ETag (add quotes if not present)
-       if file_checksum and not file_checksum.startswith('"'):
-           etag_value = f'"{file_checksum}"'
-       else:
-           etag_value = file_checksum or '""'
+       # Get correct checksum from database
+       # sha256 column stores: git blob SHA1 for non-LFS, SHA256 for LFS
+       file_record = File.get_or_none(
+           (File.repo_full_id == repo_id) & (File.path_in_repo == path)
+       )
+       # HuggingFace expects a plain, unquoted hex digest:
+       # git blob SHA1 for non-LFS files, SHA256 for LFS files
+       etag_value = file_record.sha256 if file_record and file_record.sha256 else ""

        response_headers = {
            # Critical headers for HuggingFace client
            "X-Repo-Commit": commit_hash or "",
-           "X-Linked-Etag": etag_value,
+           "X-Linked-Etag": etag_value,  # Plain hex, not quoted
            "X-Linked-Size": str(file_size) if file_size else "0",
-           "ETag": etag_value,
+           "ETag": etag_value,  # Plain hex, not quoted
            "Content-Length": str(file_size) if file_size else "0",
            "Accept-Ranges": "bytes",  # Support resume
            # Additional useful headers
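
With this change, a HEAD request against the resolve route should return the bare digest in both ETag headers. A minimal client-side check (a sketch; host, repo, and file path are hypothetical):

    import requests

    url = "http://localhost:8000/org/repo/resolve/main/config.json"  # hypothetical deployment
    resp = requests.head(url, allow_redirects=False)
    print(resp.headers.get("ETag"))           # bare hex digest, no surrounding quotes
    print(resp.headers.get("X-Linked-Size"))  # file size in bytes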
@@ -502,7 +524,8 @@ async def commit(
                        400, detail={"error": f"Failed to decode base64: {e}"}
                    )

-               new_sha256 = hashlib.sha256(data).hexdigest()
+               # Calculate git blob SHA1 for non-LFS files (HuggingFace format)
+               git_blob_sha1 = calculate_git_blob_sha1(data)

                # Check if file unchanged (deduplication)
                existing = File.get_or_none(
@@ -510,7 +533,7 @@ async def commit(
                )
                if (
                    existing
-                   and existing.sha256 == new_sha256
+                   and existing.sha256 == git_blob_sha1
                    and existing.size == len(data)
                ):
                    print(f"Skipping unchanged file: {path}")
@@ -533,17 +556,17 @@ async def commit(
                        500, detail={"error": f"Failed to upload {path}: {e}"}
                    )

-               # Update database
+               # Update database - store git blob SHA1 in sha256 column for non-LFS files
                File.insert(
                    repo_full_id=repo_id,
                    path_in_repo=path,
                    size=len(data),
-                   sha256=new_sha256,
+                   sha256=git_blob_sha1,  # Git blob SHA1 for non-LFS files
                    lfs=False,
                ).on_conflict(
                    conflict_target=(File.repo_full_id, File.path_in_repo),
                    update={
-                       File.sha256: new_sha256,
+                       File.sha256: git_blob_sha1,  # Git blob SHA1 for non-LFS files
                        File.size: len(data),
                        File.updated_at: datetime.now(timezone.utc),
                    },

View File

@@ -4,6 +4,7 @@ This module implements the Git LFS Batch API specification for handling
large file uploads (>10MB). It provides presigned S3 URLs for direct uploads.
"""

+import base64
from datetime import datetime, timedelta, timezone
from typing import List, Optional
@@ -180,11 +181,17 @@ async def lfs_batch(
                else:
                    # Single PUT upload
                    try:
+                       # Convert SHA256 hex to base64 for S3 checksum verification
+                       checksum_sha256 = base64.b64encode(bytes.fromhex(oid)).decode(
+                           "utf-8"
+                       )
                        upload_info = generate_upload_presigned_url(
                            bucket=cfg.s3.bucket,
                            key=lfs_key,
                            expires_in=3600,  # 1 hour
                            content_type="application/octet-stream",
+                           checksum_sha256=checksum_sha256,
                        )
                        objects_response.append(
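
S3's `ChecksumSHA256` parameter expects the digest base64-encoded, not as the hex string Git LFS uses for its oid, hence the conversion. For example, a standalone sketch using the well-known SHA256 of "hello":

    import base64

    oid = "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"  # sha256("hello")
    print(base64.b64encode(bytes.fromhex(oid)).decode("utf-8"))
    # LPJNul+wow4m6DsqxbninhsWHlwfp0JecwQzYpOLmCQ=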

View File

@@ -82,7 +82,7 @@ def generate_download_presigned_url(
    # Add Content-Disposition if filename specified
    if filename:
-       params["ResponseContentDisposition"] = f'attachment; filename="{filename}"'
+       params["ResponseContentDisposition"] = f'inline; filename="{filename}"'

    url = s3.generate_presigned_url(
        "get_object",
@@ -98,6 +98,7 @@ def generate_upload_presigned_url(
    key: str,
    expires_in: int = 3600,
    content_type: str = "application/octet-stream",
+   checksum_sha256: str = None,
) -> dict:
    """Generate presigned URL for uploading to S3.
@@ -106,20 +107,29 @@ def generate_upload_presigned_url(
        key: Object key in S3
        expires_in: URL expiration time in seconds (default: 1 hour)
        content_type: Content type of the object
+       checksum_sha256: Base64-encoded SHA256 checksum for S3 to verify (optional)

    Returns:
        Dict with 'url', 'fields', and 'expires_at'
    """
    s3 = get_s3_client()

+   # Prepare params for presigned URL
+   params = {
+       "Bucket": bucket,
+       "Key": key,
+       # "ContentType": content_type,
+   }
+
+   # Add SHA256 checksum if provided (for LFS files)
+   # S3 will verify the checksum automatically
+   if checksum_sha256:
+       params["ChecksumSHA256"] = checksum_sha256
+
    # Generate presigned PUT URL
    url = s3.generate_presigned_url(
        "put_object",
-       Params={
-           "Bucket": bucket,
-           "Key": key,
-           # "ContentType": content_type,
-       },
+       Params=params,
        ExpiresIn=expires_in,
        HttpMethod="PUT",
    )
@@ -129,13 +139,19 @@ def generate_upload_presigned_url(
"%Y-%m-%dT%H:%M:%S.%fZ"
)
headers = {
"Content-Type": content_type,
}
# If checksum is required, client must send it
if checksum_sha256:
headers["x-amz-checksum-sha256"] = checksum_sha256
return {
"url": url.replace(cfg.s3.endpoint, cfg.s3.public_endpoint),
"expires_at": expires_at,
"method": "PUT",
"headers": {
"Content-Type": content_type,
},
"headers": headers,
}
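
A caller then streams the file to `upload["url"]` with `upload["headers"]`; when the checksum header is signed into the URL, S3 rejects any body whose SHA256 does not match. A minimal consumer sketch (assumes the `requests` library; bucket, key, and file name are placeholders):

    import requests

    upload = generate_upload_presigned_url(
        bucket="my-bucket",
        key="lfs/objects/<oid>",          # placeholder key
        checksum_sha256=checksum_sha256,  # base64 digest from the LFS batch handler
    )
    with open("model.bin", "rb") as f:
        resp = requests.put(upload["url"], data=f, headers=upload["headers"])
    resp.raise_for_status()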