#!/usr/bin/env python3
"""Generate test repository with nested folders and mixed LFS/non-LFS files."""

import hashlib
import os
import random
import shutil
import string
import sys
from pathlib import Path
from typing import Dict, List, Tuple

# Files whose size is at or above this many bytes are reported as LFS files.
LFS_THRESHOLD = 1_000_000

# Name of the generated listing; it is excluded from its own contents.
_FILE_LIST_NAME = "FILE_LIST.txt"

# Real .gitattributes content. The patterns cover every generated file whose
# size reaches LFS_THRESHOLD.
_GITATTRIBUTES_CONTENT = (
    "# Git LFS tracking\n"
    "*.bin filter=lfs diff=lfs merge=lfs -text\n"
    "*.safetensors filter=lfs diff=lfs merge=lfs -text\n"
    "*.png filter=lfs diff=lfs merge=lfs -text\n"
    "*.jpg filter=lfs diff=lfs merge=lfs -text\n"
    "config/large_config.yaml filter=lfs diff=lfs merge=lfs -text\n"
)

# Sub-directories in creation order: (directory, ((file name, size, binary), ...)).
# Root-level files are handled separately in generate_test_repo().
_NESTED_LAYOUT = (
    (
        "config",
        (
            ("settings.json", 1_500, False),
            ("large_config.yaml", 2_000_000, False),  # LFS
        ),
    ),
    (
        "models",
        (
            ("small_model.txt", 50_000, False),
            ("large_model.bin", 10_000_000, True),  # LFS
        ),
    ),
    (
        "models/checkpoints",
        (
            ("checkpoint_1.safetensors", 5_000_000, True),  # LFS
            ("checkpoint_2.safetensors", 5_500_000, True),  # LFS
            ("metadata.json", 800, False),
        ),
    ),
    (
        "data/train/samples",
        (
            ("image_001.png", 1_500_000, True),  # LFS
            ("image_002.png", 1_600_000, True),  # LFS
            ("image_003.png", 1_400_000, True),  # LFS
            ("labels.txt", 3_000, False),
        ),
    ),
    ("data/train", (("dataset.csv", 500_000, False),)),
    ("data/test", (("results.json", 2_500, False),)),
    ("docs", (("guide.md", 8_000, False),)),
    (
        "docs/images",
        (
            ("diagram.png", 2_000_000, True),  # LFS
            ("screenshot.jpg", 1_200_000, True),  # LFS
        ),
    ),
    (
        "scripts",
        (
            ("train.py", 4_000, False),
            ("evaluate.py", 3_500, False),
        ),
    ),
)


def random_content(size: int) -> bytes:
    """Generate random binary content.

    Args:
        size: Size in bytes

    Returns:
        Random bytes
    """
    return os.urandom(size)


def random_text_content(size: int) -> bytes:
    """Generate random text content.

    Args:
        size: Size in bytes

    Returns:
        Random text as bytes
    """
    # Every character is ASCII, so the encoded length equals ``size``.
    chars = string.ascii_letters + string.digits + " \n"
    content = "".join(random.choices(chars, k=size))
    return content.encode("utf-8")


def _write_and_report(path: Path, content: bytes) -> Tuple[str, int]:
    """Write ``content`` to ``path``, print one report line, return (sha256, size)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(content)

    size = len(content)
    sha256 = hashlib.sha256(content).hexdigest()
    lfs_marker = "[LFS]" if size >= LFS_THRESHOLD else ""
    print(f" {str(path):<60} {size:>10} bytes sha256:{sha256[:8]} {lfs_marker}")
    return sha256, size


def create_file(path: Path, size: int, binary: bool = False) -> Tuple[str, int]:
    """Create a file with random content.

    Missing parent directories are created. A one-line report (path, size,
    first eight hex digits of the SHA-256, LFS marker) is printed.

    Args:
        path: File path
        size: File size in bytes
        binary: If True, use binary content; otherwise text

    Returns:
        Tuple of (SHA-256 hex digest of the content, size in bytes)

    Raises:
        ValueError: If ``size`` is negative.
    """
    # Validate before touching the filesystem; previously only the binary
    # branch rejected negative sizes (inside os.urandom).
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")

    content = random_content(size) if binary else random_text_content(size)
    return _write_and_report(path, content)


def _write_file_listing(base: Path) -> Path:
    """Write an indented listing of everything under ``base`` and return its path."""
    listing_path = base / _FILE_LIST_NAME
    lines: List[str] = ["# File listing for test repository\n\n"]

    for root, dirs, files in os.walk(base):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()

        # Depth relative to ``base``; robust against the base string
        # re-appearing elsewhere in the path.
        level = len(Path(root).relative_to(base).parts)
        indent = " " * 2 * level
        sub_indent = " " * 2 * (level + 1)

        lines.append(f"{indent}{os.path.basename(root)}/\n")
        for file in sorted(files):
            if file == _FILE_LIST_NAME:
                continue
            size = (Path(root) / file).stat().st_size
            lfs_marker = "[LFS]" if size >= LFS_THRESHOLD else ""
            lines.append(f"{sub_indent}{file} ({size} bytes) {lfs_marker}\n")

    with open(listing_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
    return listing_path


def generate_test_repo(base_path: str = "test_folder"):
    """Generate test repository with nested structure.

    Any existing directory at ``base_path`` is deleted first.

    Structure (sizes in decimal units; LFS means size >= LFS_THRESHOLD):
        test_folder/
            README.md                        (small text - 5 KB)
            .gitattributes                   (LFS config)
            FILE_LIST.txt                    (generated listing)
            config/
                settings.json                (small - 1.5 KB)
                large_config.yaml            (LFS - 2 MB)
            models/
                small_model.txt              (small text - 50 KB)
                large_model.bin              (LFS - 10 MB)
                checkpoints/
                    checkpoint_1.safetensors (LFS - 5 MB)
                    checkpoint_2.safetensors (LFS - 5.5 MB)
                    metadata.json            (small - 800 B)
            data/
                train/
                    samples/
                        image_001.png        (LFS - 1.5 MB)
                        image_002.png        (LFS - 1.6 MB)
                        image_003.png        (LFS - 1.4 MB)
                        labels.txt           (small - 3 KB)
                    dataset.csv              (medium - 500 KB)
                test/
                    results.json             (small - 2.5 KB)
            docs/
                guide.md                     (small - 8 KB)
                images/
                    diagram.png              (LFS - 2 MB)
                    screenshot.jpg           (LFS - 1.2 MB)
            scripts/
                train.py                     (small - 4 KB)
                evaluate.py                  (small - 3.5 KB)

    Args:
        base_path: Directory to (re)create the repository in
    """
    base = Path(base_path)

    # Clean up if exists
    if base.exists():
        shutil.rmtree(base)

    rule = "=" * 100
    print(f"\n{rule}")
    print(f"Generating test repository: {base_path}")
    print(
        f"LFS threshold: {LFS_THRESHOLD:,} bytes "
        f"({LFS_THRESHOLD / 1_000_000:g} MB)"
    )
    print(f"{rule}\n")

    files_created: List[Tuple[str, int]] = []
    # Summary label -> number of files, in creation order.
    dir_counts: Dict[str, int] = {}

    # Root level files
    print("Root level:")
    files_created.append(create_file(base / "README.md", 5_000, binary=False))
    # Write the real .gitattributes directly so the recorded hash and size
    # describe what is actually on disk.
    files_created.append(
        _write_and_report(
            base / ".gitattributes", _GITATTRIBUTES_CONTENT.encode("utf-8")
        )
    )
    dir_counts["Root"] = 2

    for directory, specs in _NESTED_LAYOUT:
        print(f"\n{directory}/:")
        for name, size, binary in specs:
            files_created.append(
                create_file(base / directory / name, size, binary=binary)
            )
        dir_counts[f"{directory}/"] = len(specs)

    lfs_files = [entry for entry in files_created if entry[1] >= LFS_THRESHOLD]
    regular_files = [entry for entry in files_created if entry[1] < LFS_THRESHOLD]

    # Summary
    print(f"\n{rule}")
    print("Summary:")
    print(f" Total files: {len(files_created)}")
    print(f" LFS files (>=1MB): {len(lfs_files)}")
    print(f" Regular files (<1MB): {len(regular_files)}")

    total_size = sum(size for _, size in files_created)
    lfs_size = sum(size for _, size in lfs_files)
    regular_size = sum(size for _, size in regular_files)
    print(f"\n Total size: {total_size / 1024 / 1024:.2f} MB")
    print(f" LFS size: {lfs_size / 1024 / 1024:.2f} MB")
    print(f" Regular size: {regular_size / 1024:.2f} KB")

    # Derived from what was created, so it cannot drift from the layout.
    print("\n Directory structure:")
    for label, count in dir_counts.items():
        noun = "file" if count == 1 else "files"
        print(f" - {label}: {count} {noun}")

    print(f"\n Test repository created at: {base.absolute()}")
    print(f"{rule}\n")

    # Create file listing
    file_list_path = _write_file_listing(base)
    print(f"File listing saved to: {file_list_path}\n")


def main() -> None:
    """Generate the repository at ``sys.argv[1]`` (default: ``test_folder``)."""
    base_path = sys.argv[1] if len(sys.argv) > 1 else "test_folder"
    generate_test_repo(base_path)

    print("✅ Test repository generated successfully!")
    print("\nNext steps:")
    print(" 1. Upload to KohakuHub via API/CLI")
    print(" 2. Test git clone")
    print(" 3. Verify folder structure matches")
    print(" 4. Test git lfs pull for large files")


if __name__ == "__main__":
    main()