mirror of
https://github.com/KohakuBlueleaf/KohakuHub.git
synced 2026-03-11 17:34:08 -05:00
307 lines
9.2 KiB
Python
307 lines
9.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate test repository with nested folders and mixed LFS/non-LFS files."""
|
|
|
|
import hashlib
|
|
import os
|
|
import random
|
|
import string
|
|
from pathlib import Path
|
|
|
|
|
|
def random_content(size: int) -> bytes:
    """Produce a block of random binary data.

    Args:
        size: Number of bytes to produce

    Returns:
        ``size`` bytes obtained from ``os.urandom``
    """
    payload = os.urandom(size)
    return payload
|
|
|
|
|
|
def random_text_content(size: int) -> bytes:
    """Produce random text made of letters, digits, spaces and newlines.

    Args:
        size: Number of characters to draw (one byte each once encoded)

    Returns:
        The drawn characters encoded as UTF-8 bytes
    """
    alphabet = "".join((string.ascii_letters, string.digits, " \n"))
    # A single choices() call draws all characters, with replacement.
    picked = random.choices(alphabet, k=size)
    return "".join(picked).encode("utf-8")
|
|
|
|
|
|
def create_file(path: Path, size: int, binary: bool = False) -> tuple:
    """Create a file with random content and report it on stdout.

    Missing parent directories are created. One progress line is printed
    per file, carrying an ``[LFS]`` marker when ``size`` is at least
    1,000,000 bytes.

    Args:
        path: File path
        size: File size in bytes
        binary: If True, use binary content; otherwise text

    Returns:
        Tuple ``(sha256, size)`` where ``sha256`` is the hex digest of the
        written content and ``size`` is the requested size
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    content = random_content(size) if binary else random_text_content(size)
    path.write_bytes(content)

    # Only the SHA-256 digest is reported and returned; the previously
    # computed SHA-1 digest was never used, so it is no longer calculated.
    sha256 = hashlib.sha256(content).hexdigest()

    lfs_marker = "[LFS]" if size >= 1_000_000 else ""
    print(f" {str(path):<60} {size:>10} bytes sha256:{sha256[:8]} {lfs_marker}")

    return sha256, size
|
|
|
|
|
|
def generate_test_repo(base_path: str = "test_folder") -> None:
    """Generate test repository with nested structure.

    Any existing directory at ``base_path`` is removed first. Files are
    filled with random content by ``create_file``; ``.gitattributes`` is
    then overwritten with the Git LFS tracking rules, and a
    ``FILE_LIST.txt`` listing is written into the repository root.
    Progress and a summary are printed to stdout.

    Structure:
        test_folder/
            README.md                           (small text)
            .gitattributes                      (LFS config)
            config/
                settings.json                   (small JSON)
                large_config.yaml               (LFS - 2MB)
            models/
                small_model.txt                 (small text)
                large_model.bin                 (LFS - 10MB)
                checkpoints/
                    checkpoint_1.safetensors    (LFS - 5MB)
                    checkpoint_2.safetensors    (LFS - 5.5MB)
                    metadata.json               (small)
            data/
                train/
                    dataset.csv                 (medium - 500KB)
                    samples/
                        image_001.png           (LFS - 1.5MB)
                        image_002.png           (LFS - 1.6MB)
                        image_003.png           (LFS - 1.4MB)
                        labels.txt              (small)
                test/
                    results.json                (small)
            docs/
                guide.md                        (small)
                images/
                    diagram.png                 (LFS - 2MB)
                    screenshot.jpg              (LFS - 1.2MB)
            scripts/
                train.py                        (small)
                evaluate.py                     (small)
            FILE_LIST.txt                       (generated listing)

    Args:
        base_path: Directory to (re)create the test repository in
    """
    # Files at or above this size are counted as LFS files. Keep in sync
    # with the "[LFS]" marker threshold printed by create_file.
    lfs_threshold = 1_000_000

    # Single source of truth for what gets created:
    # (directory relative to the root, ((file name, size in bytes, binary), ...))
    layout = (
        (
            "",
            (
                ("README.md", 5_000, False),
                # Placeholder; overwritten with the real LFS rules below.
                (".gitattributes", 200, False),
            ),
        ),
        (
            "config",
            (
                ("settings.json", 1_500, False),
                ("large_config.yaml", 2_000_000, False),  # LFS
            ),
        ),
        (
            "models",
            (
                ("small_model.txt", 50_000, False),
                ("large_model.bin", 10_000_000, True),  # LFS
            ),
        ),
        (
            "models/checkpoints",
            (
                ("checkpoint_1.safetensors", 5_000_000, True),  # LFS
                ("checkpoint_2.safetensors", 5_500_000, True),  # LFS
                ("metadata.json", 800, False),
            ),
        ),
        (
            "data/train",
            (("dataset.csv", 500_000, False),),
        ),
        (
            "data/train/samples",
            (
                ("image_001.png", 1_500_000, True),  # LFS
                ("image_002.png", 1_600_000, True),  # LFS
                ("image_003.png", 1_400_000, True),  # LFS
                ("labels.txt", 3_000, False),
            ),
        ),
        (
            "data/test",
            (("results.json", 2_500, False),),
        ),
        (
            "docs",
            (("guide.md", 8_000, False),),
        ),
        (
            "docs/images",
            (
                ("diagram.png", 2_000_000, True),  # LFS
                ("screenshot.jpg", 1_200_000, True),  # LFS
            ),
        ),
        (
            "scripts",
            (
                ("train.py", 4_000, False),
                ("evaluate.py", 3_500, False),
            ),
        ),
    )

    base = Path(base_path)

    # Clean up if exists
    if base.exists():
        import shutil

        shutil.rmtree(base)

    print(f"\n{'='*100}")
    print(f"Generating test repository: {base_path}")
    print("LFS threshold: 1,000,000 bytes (1 MB)")
    print(f"{'='*100}\n")

    # Maps each created path to its (sha256, size) record, in creation order.
    records = {}
    for directory, entries in layout:
        print("Root level:" if not directory else f"\n{directory}/:")
        for name, size, binary in entries:
            target = base / directory / name
            records[target] = create_file(target, size, binary=binary)

    # Generate proper .gitattributes
    print("\nGenerating .gitattributes...")
    lfs_patterns = (
        "*.bin",
        "*.safetensors",
        "*.png",
        "*.jpg",
        "config/large_config.yaml",
    )
    attrs_text = "# Git LFS tracking\n" + "".join(
        f"{pattern} filter=lfs diff=lfs merge=lfs -text\n" for pattern in lfs_patterns
    )
    # Written as bytes so the file is UTF-8 with LF line endings on every
    # platform (text mode would translate newlines on Windows).
    attrs_bytes = attrs_text.encode("utf-8")
    attrs_path = base / ".gitattributes"
    attrs_path.write_bytes(attrs_bytes)
    # Replace the placeholder's record so the summary reflects the real file.
    records[attrs_path] = (hashlib.sha256(attrs_bytes).hexdigest(), len(attrs_bytes))

    lfs_files = [rec for rec in records.values() if rec[1] >= lfs_threshold]
    regular_files = [rec for rec in records.values() if rec[1] < lfs_threshold]

    # Summary
    print(f"\n{'='*100}")
    print("Summary:")
    print(f" Total files: {len(records)}")
    print(f" LFS files (>=1MB): {len(lfs_files)}")
    print(f" Regular files (<1MB): {len(regular_files)}")

    total_size = sum(size for _, size in records.values())
    lfs_size = sum(size for _, size in lfs_files)
    regular_size = sum(size for _, size in regular_files)

    print(f"\n Total size: {total_size / 1000 / 1000:.2f} MB")
    print(f" LFS size: {lfs_size / 1000 / 1000:.2f} MB")
    print(f" Regular size: {regular_size / 1000:.2f} KB")
    print("\n Directory structure:")
    # Counts come from the layout table so they cannot drift from what is
    # actually created.
    for directory, entries in layout:
        label = f"{directory}/" if directory else "Root"
        noun = "file" if len(entries) == 1 else "files"
        print(f" - {label}: {len(entries)} {noun}")

    print(f"\n Test repository created at: {base.absolute()}")
    print(f"{'='*100}\n")

    # Create file listing
    file_list_path = base / "FILE_LIST.txt"
    with open(file_list_path, "w", encoding="utf-8") as f:
        f.write("# File listing for test repository\n\n")
        for root, dirs, files in os.walk(base):
            # Sorting in place makes os.walk descend in a stable order.
            dirs.sort()
            # Depth below the repository root; computed from path parts so a
            # base path that recurs inside a sub-path cannot skew it.
            level = len(Path(root).relative_to(base).parts)
            indent = " " * 2 * level
            f.write(f"{indent}{os.path.basename(root)}/\n")
            sub_indent = " " * 2 * (level + 1)
            for file in sorted(files):
                # The listing itself is being written; leave it out.
                if file == "FILE_LIST.txt":
                    continue
                file_path = Path(root) / file
                size = file_path.stat().st_size
                lfs_marker = "[LFS]" if size >= lfs_threshold else ""
                f.write(f"{sub_indent}{file} ({size} bytes) {lfs_marker}\n")

    print(f"File listing saved to: {file_list_path}\n")
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # First positional argument selects the output directory, if given.
    cli_args = sys.argv[1:]
    base_path = cli_args[0] if cli_args else "test_folder"
    generate_test_repo(base_path)

    closing_lines = [
        "✅ Test repository generated successfully!",
        "\nNext steps:",
        " 1. Upload to KohakuHub via API/CLI",
        " 2. Test git clone",
        " 3. Verify folder structure matches",
        " 4. Test git lfs pull for large files",
    ]
    print("\n".join(closing_lines))
|