Mirror of https://github.com/KohakuBlueleaf/KohakuHub.git (synced 2026-04-29 19:08:58 -05:00)

Commit: update etag/oid and download filename process
@@ -398,9 +398,21 @@ async def list_repo_tree(
         # Remove prefix from path to get relative path
         relative_path = obj.path[prefix_len:] if prefix else obj.path
 
+        # Get correct checksum from database
+        # sha256 column stores: git blob SHA1 for non-LFS, SHA256 for LFS
+        file_record = File.get_or_none(
+            (File.repo_full_id == repo_id) & (File.path_in_repo == obj.path)
+        )
+
+        checksum = (
+            file_record.sha256
+            if file_record and file_record.sha256
+            else obj.checksum
+        )
+
         file_obj = {
             "type": "file",
-            "oid": obj.checksum,
+            "oid": checksum,  # Git blob SHA1 for non-LFS, SHA256 for LFS
             "size": obj.size_bytes,
             "path": relative_path,
         }
@@ -408,7 +420,7 @@ async def list_repo_tree(
         # Add LFS metadata if it's an LFS file
         if is_lfs:
             file_obj["lfs"] = {
-                "oid": obj.checksum,  # Use checksum as LFS oid
+                "oid": checksum,  # SHA256 for LFS files
                 "size": obj.size_bytes,
                 "pointerSize": 134,  # Standard Git LFS pointer size
             }
@@ -505,11 +517,23 @@ async def get_paths_info(
             # It's a file
             is_lfs = obj_stats.size_bytes > cfg.app.lfs_threshold_bytes
 
+            # Get correct checksum from database
+            # sha256 column stores: git blob SHA1 for non-LFS, SHA256 for LFS
+            file_record = File.get_or_none(
+                (File.repo_full_id == repo_id) & (File.path_in_repo == clean_path)
+            )
+
+            checksum = (
+                file_record.sha256
+                if file_record and file_record.sha256
+                else obj_stats.checksum
+            )
+
             file_info = {
                 "type": "file",
                 "path": clean_path,
                 "size": obj_stats.size_bytes,
-                "blob_id": obj_stats.checksum,
+                "blob_id": checksum,  # Git blob SHA1 for non-LFS, SHA256 for LFS
                 "lfs": None,
                 "last_commit": None,
                 "security": None,
@@ -518,7 +542,7 @@ async def get_paths_info(
             # Add LFS metadata if applicable
             if is_lfs:
                 file_info["lfs"] = {
-                    "oid": obj_stats.checksum,
+                    "oid": checksum,  # SHA256 for LFS files
                     "size": obj_stats.size_bytes,
                     "pointerSize": 134,
                 }
@@ -11,6 +11,25 @@ from typing import Any, Dict, List, Optional
 from fastapi import APIRouter, Depends, HTTPException, Request, Header
 from lakefs_client.models import CommitCreation, StagingLocation, StagingMetadata
 
+
+def calculate_git_blob_sha1(content: bytes) -> str:
+    """Calculate SHA1 hash in git blob format.
+
+    Git uses: sha1(f'blob {size}\\0' + content)
+
+    Args:
+        content: File content bytes
+
+    Returns:
+        SHA1 hex digest
+    """
+    size = len(content)
+    sha = hashlib.sha1()
+    sha.update(f"blob {size}\0".encode("utf-8"))
+    sha.update(content)
+    return sha.hexdigest()
+
+
 from ..config import cfg
 from ..db import File, Repository, StagingUpload, User
 from .auth import get_current_user, get_optional_user
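Note: the helper mirrors `git hash-object`; the content is hashed behind a `blob {size}\0` header. A minimal standalone check, using the well-known hash of git's empty blob:

import hashlib


def calculate_git_blob_sha1(content: bytes) -> str:
    """SHA1 over b"blob {size}\\0" + content, exactly as git hashes blobs."""
    sha = hashlib.sha1()
    sha.update(f"blob {len(content)}\0".encode("utf-8"))
    sha.update(content)
    return sha.hexdigest()


# e69de29... is git's well-known empty-blob hash.
# Shell equivalent: printf '' | git hash-object --stdin
assert calculate_git_blob_sha1(b"") == "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"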
@@ -354,20 +373,23 @@ async def resolve_file(
 
     # Prepare headers required by HuggingFace client
     file_size = obj_stat.size_bytes
-    file_checksum = obj_stat.checksum  # This is the ETag from LakeFS
 
-    # Normalize ETag (add quotes if not present)
-    if file_checksum and not file_checksum.startswith('"'):
-        etag_value = f'"{file_checksum}"'
-    else:
-        etag_value = file_checksum or '""'
+    # Get correct checksum from database
+    # sha256 column stores: git blob SHA1 for non-LFS, SHA256 for LFS
+    file_record = File.get_or_none(
+        (File.repo_full_id == repo_id) & (File.path_in_repo == path)
+    )
+
+    # HuggingFace expects plain SHA256 hex (64 characters, unquoted)
+    # For non-LFS: use git blob SHA1, for LFS: use SHA256
+    etag_value = file_record.sha256 if file_record and file_record.sha256 else ""
 
     response_headers = {
         # Critical headers for HuggingFace client
         "X-Repo-Commit": commit_hash or "",
-        "X-Linked-Etag": etag_value,
+        "X-Linked-Etag": etag_value,  # Plain hex, not quoted
         "X-Linked-Size": str(file_size) if file_size else "0",
-        "ETag": etag_value,
+        "ETag": etag_value,  # Plain hex, not quoted
         "Content-Length": str(file_size) if file_size else "0",
         "Accept-Ranges": "bytes",  # Support resume
         # Additional useful headers
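Note: since the ETag now follows the HuggingFace convention rather than the LakeFS checksum, a client can recompute the expected value locally. A rough sketch, assuming the 10MB threshold mentioned in the LFS module docstring (the real cutoff is cfg.app.lfs_threshold_bytes) and the `calculate_git_blob_sha1` helper sketched above:

import hashlib

LFS_THRESHOLD_BYTES = 10 * 1024 * 1024  # assumption: matches cfg.app.lfs_threshold_bytes


def expected_etag(content: bytes) -> str:
    """Non-LFS files advertise the git blob SHA1; LFS files the plain SHA256, unquoted."""
    if len(content) > LFS_THRESHOLD_BYTES:
        return hashlib.sha256(content).hexdigest()
    return calculate_git_blob_sha1(content)  # helper added by this commit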
@@ -502,7 +524,8 @@ async def commit(
                     400, detail={"error": f"Failed to decode base64: {e}"}
                 )
 
-            new_sha256 = hashlib.sha256(data).hexdigest()
+            # Calculate git blob SHA1 for non-LFS files (HuggingFace format)
+            git_blob_sha1 = calculate_git_blob_sha1(data)
 
             # Check if file unchanged (deduplication)
             existing = File.get_or_none(
@@ -510,7 +533,7 @@ async def commit(
             )
             if (
                 existing
-                and existing.sha256 == new_sha256
+                and existing.sha256 == git_blob_sha1
                 and existing.size == len(data)
             ):
                 print(f"Skipping unchanged file: {path}")
@@ -533,17 +556,17 @@ async def commit(
                     500, detail={"error": f"Failed to upload {path}: {e}"}
                 )
 
-            # Update database
+            # Update database - store git blob SHA1 in sha256 column for non-LFS files
             File.insert(
                 repo_full_id=repo_id,
                 path_in_repo=path,
                 size=len(data),
-                sha256=new_sha256,
+                sha256=git_blob_sha1,  # Git blob SHA1 for non-LFS files
                 lfs=False,
             ).on_conflict(
                 conflict_target=(File.repo_full_id, File.path_in_repo),
                 update={
-                    File.sha256: new_sha256,
+                    File.sha256: git_blob_sha1,  # Git blob SHA1 for non-LFS files
                     File.size: len(data),
                     File.updated_at: datetime.now(timezone.utc),
                 },
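Note: the write path is a peewee `INSERT ... ON CONFLICT DO UPDATE` upsert, so re-committing the same path updates the row in place. A minimal sketch of the same pattern against an in-memory SQLite database (fields trimmed to the ones this hunk touches; the real model lives in `..db`):

from peewee import CharField, IntegerField, Model, SqliteDatabase

db = SqliteDatabase(":memory:")


class File(Model):
    repo_full_id = CharField()
    path_in_repo = CharField()
    size = IntegerField()
    sha256 = CharField()

    class Meta:
        database = db
        # ON CONFLICT needs a unique constraint on the conflict target
        indexes = ((("repo_full_id", "path_in_repo"), True),)


db.create_tables([File])

# Insert once, then upsert the same path with a new checksum.
for digest in ("old-sha", "new-sha"):
    File.insert(
        repo_full_id="org/repo", path_in_repo="a.txt", size=3, sha256=digest
    ).on_conflict(
        conflict_target=(File.repo_full_id, File.path_in_repo),
        update={File.sha256: digest, File.size: 3},
    ).execute()

assert File.get(File.path_in_repo == "a.txt").sha256 == "new-sha"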
@@ -4,6 +4,7 @@ This module implements the Git LFS Batch API specification for handling
 large file uploads (>10MB). It provides presigned S3 URLs for direct uploads.
 """
 
+import base64
 from datetime import datetime, timedelta, timezone
 from typing import List, Optional
 
@@ -180,11 +181,17 @@ async def lfs_batch(
         else:
             # Single PUT upload
             try:
+                # Convert SHA256 hex to base64 for S3 checksum verification
+                checksum_sha256 = base64.b64encode(bytes.fromhex(oid)).decode(
+                    "utf-8"
+                )
+
                 upload_info = generate_upload_presigned_url(
                     bucket=cfg.s3.bucket,
                     key=lfs_key,
                     expires_in=3600,  # 1 hour
                     content_type="application/octet-stream",
+                    checksum_sha256=checksum_sha256,
                 )
 
                 objects_response.append(
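Note: the conversion exists because the LFS oid is a lowercase hex SHA256 while S3's `ChecksumSHA256` parameter takes the raw 32-byte digest base64-encoded. For example:

import base64
import hashlib

oid = hashlib.sha256(b"example").hexdigest()  # LFS-style oid: 64 hex chars
checksum_sha256 = base64.b64encode(bytes.fromhex(oid)).decode("utf-8")
print(len(oid), len(checksum_sha256))  # 64 hex chars -> 44-char base64 string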
@@ -82,7 +82,7 @@ def generate_download_presigned_url(
 
     # Add Content-Disposition if filename specified
     if filename:
-        params["ResponseContentDisposition"] = f'attachment; filename="{filename}"'
+        params["ResponseContentDisposition"] = f'inline; filename="{filename}"'
 
     url = s3.generate_presigned_url(
         "get_object",
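Note: switching `attachment` to `inline` lets browsers preview files (READMEs, images) instead of forcing a save dialog, while still suggesting a filename. The override is a standard `get_object` response parameter; a boto3 sketch with placeholder bucket/key:

import boto3

s3 = boto3.client("s3")  # assumes credentials/endpoint configured elsewhere
url = s3.generate_presigned_url(
    "get_object",
    Params={
        "Bucket": "my-bucket",  # placeholder
        "Key": "some/object",  # placeholder
        # S3 echoes this back as the Content-Disposition response header
        "ResponseContentDisposition": 'inline; filename="README.md"',
    },
    ExpiresIn=3600,
)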
@@ -98,6 +98,7 @@ def generate_upload_presigned_url(
     key: str,
     expires_in: int = 3600,
     content_type: str = "application/octet-stream",
+    checksum_sha256: str = None,
 ) -> dict:
     """Generate presigned URL for uploading to S3.
 
@@ -106,20 +107,29 @@ def generate_upload_presigned_url(
         key: Object key in S3
         expires_in: URL expiration time in seconds (default: 1 hour)
         content_type: Content type of the object
+        checksum_sha256: Base64-encoded SHA256 checksum for S3 to verify (optional)
 
     Returns:
         Dict with 'url', 'fields', and 'expires_at'
     """
     s3 = get_s3_client()
 
+    # Prepare params for presigned URL
+    params = {
+        "Bucket": bucket,
+        "Key": key,
+        # "ContentType": content_type,
+    }
+
+    # Add SHA256 checksum if provided (for LFS files)
+    # S3 will verify the checksum automatically
+    if checksum_sha256:
+        params["ChecksumSHA256"] = checksum_sha256
+
     # Generate presigned PUT URL
     url = s3.generate_presigned_url(
         "put_object",
-        Params={
-            "Bucket": bucket,
-            "Key": key,
-            # "ContentType": content_type,
-        },
+        Params=params,
         ExpiresIn=expires_in,
         HttpMethod="PUT",
     )
@@ -129,13 +139,19 @@ def generate_upload_presigned_url(
         "%Y-%m-%dT%H:%M:%S.%fZ"
     )
 
+    headers = {
+        "Content-Type": content_type,
+    }
+
+    # If checksum is required, client must send it
+    if checksum_sha256:
+        headers["x-amz-checksum-sha256"] = checksum_sha256
+
     return {
         "url": url.replace(cfg.s3.endpoint, cfg.s3.public_endpoint),
        "expires_at": expires_at,
         "method": "PUT",
-        "headers": {
-            "Content-Type": content_type,
-        },
+        "headers": headers,
     }
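Note: on the client side the returned dict is meant to be used verbatim; when the URL was signed with `ChecksumSHA256`, S3 rejects any PUT whose body does not hash to the signed value. A sketch with `requests`, assuming `upload_info` is the dict returned by `generate_upload_presigned_url`:

import requests

data = b"..."  # placeholder: the LFS object bytes

resp = requests.put(
    upload_info["url"],
    data=data,
    # includes Content-Type and, when signed, x-amz-checksum-sha256
    headers=upload_info["headers"],
)
resp.raise_for_status()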