mirror of
https://github.com/KohakuBlueleaf/KohakuHub.git
synced 2026-03-11 17:34:08 -05:00
140 lines
4.1 KiB
Python
140 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate test files for upload testing.
|
|
|
|
Creates 1000 files of 1.5MB each in the large_file_test folder.
|
|
Useful for testing bulk uploads, LFS threshold behavior, and performance.
|
|
"""
|
|
|
|
import os
|
|
import random
|
|
import string
|
|
from pathlib import Path
|
|
|
|
|
|
def generate_random_text(size_bytes):
    """Build a string of random characters with length ``size_bytes``.

    Characters are drawn from ASCII letters, digits, space and newline, so
    every character occupies a single byte once encoded as UTF-8.

    Args:
        size_bytes: Number of characters to produce. Zero or a negative
            value yields an empty string.

    Returns:
        A randomly generated string containing ``size_bytes`` characters.
    """
    # Letters and digits plus whitespace give vaguely text-like output
    alphabet = string.ascii_letters + string.digits + " \n"

    # Draw at most 100KB of characters per call to random.choices
    piece_limit = 100 * 1024

    pieces = []
    produced = 0
    while produced < size_bytes:
        take = min(piece_limit, size_bytes - produced)
        pieces.append("".join(random.choices(alphabet, k=take)))
        produced += take

    return "".join(pieces)
|
|
|
|
|
|
def generate_structured_content(file_number, size_bytes):
    """Generate structured content with a header and repeated paragraphs.

    The result starts with a Markdown-style header naming the file, followed
    by as many copies of a filler paragraph as are needed to reach the target
    size. The text is then cut to exactly ``size_bytes`` bytes (the header
    and paragraph are plain ASCII, so one character is one UTF-8 byte). If
    ``size_bytes`` is smaller than the header, the header itself is truncated.

    Args:
        file_number: Integer file number, rendered zero-padded to four digits
            in the header.
        size_bytes: Target content size in bytes. Zero or a negative value
            yields an empty string.

    Returns:
        String of structured content whose UTF-8 encoding is ``size_bytes``
        bytes long (empty when ``size_bytes`` is not positive).
    """
    # A non-positive size means "no content". Without this guard a negative
    # value would be used as a slice end below and would chop bytes off the
    # tail of the header instead of producing an empty result.
    if size_bytes <= 0:
        return ""

    header = f"""# Test File {file_number:04d}

This is a generated test file for upload testing.
File size: {size_bytes / (1024*1024):.2f} MB

## Content Section

"""

    # Bytes still to fill after the header (negative when the header alone
    # already exceeds the target size)
    remaining = size_bytes - len(header.encode("utf-8"))

    # Filler paragraph that is repeated to pad the file
    paragraph = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure
dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.

"""

    # Round up so the padded text is at least as long as the target; the
    # surplus is trimmed off below
    paragraph_bytes = len(paragraph.encode("utf-8"))
    num_paragraphs = max(remaining // paragraph_bytes + 1, 0)

    content = header + (paragraph * num_paragraphs)

    # Trim to exact size; errors="ignore" drops a partially cut multi-byte
    # character should the content ever stop being pure ASCII
    content_bytes = content.encode("utf-8")[:size_bytes]
    return content_bytes.decode("utf-8", errors="ignore")
|
|
|
|
|
|
def main(output_dir="large_file_test", num_files=1000, file_size_mb=1.5):
    """Create a directory full of fixed-size text files for upload testing.

    Files are named ``test_file_0001.txt``, ``test_file_0002.txt``, ... and
    filled with the output of ``generate_structured_content``. Progress is
    printed every 50 files and a summary is printed at the end.

    Args:
        output_dir: Directory to write into (created if missing). Defaults
            to ``large_file_test`` relative to the current directory.
        num_files: Number of files to create. Defaults to 1000.
        file_size_mb: Size of each file in MiB. Defaults to 1.5.
    """
    # Configuration
    output_dir = Path(output_dir)
    file_size_bytes = int(file_size_mb * 1024 * 1024)

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Creating {num_files} files of {file_size_mb}MB each...")
    print(f"Output directory: {output_dir.absolute()}")
    print(f"Total size: {num_files * file_size_mb / 1024:.2f} GB")
    print()

    # Size on disk of the files written by this run only, so leftovers from
    # an earlier run in the same directory do not inflate the summary
    total_size = 0

    # Generate files
    for i in range(1, num_files + 1):
        filename = f"test_file_{i:04d}.txt"
        filepath = output_dir / filename

        # Generate content (use structured content for better testing)
        # Change to generate_random_text(file_size_bytes) for random content
        content = generate_structured_content(i, file_size_bytes)

        # Write file. newline="\n" disables newline translation so the file
        # has the requested byte size on every platform; the text-mode
        # default would expand "\n" to "\r\n" on Windows.
        with open(filepath, "w", encoding="utf-8", newline="\n") as f:
            f.write(content)

        total_size += filepath.stat().st_size

        # Progress indicator
        if i % 50 == 0:
            progress = (i / num_files) * 100
            print(f"Progress: {i}/{num_files} files ({progress:.1f}%)")

    print()
    print(f"✓ Successfully created {num_files} files")
    print(f"✓ Location: {output_dir.absolute()}")

    # Report the measured size of the generated files
    print(f"✓ Total size: {total_size / (1024**3):.2f} GB")
    print()
    print("You can now test with:")
    print(" - Upload via web UI")
    print(" - Upload via huggingface_hub CLI")
    print(" - Upload via kohub-cli")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run main() and turn failures into a non-zero exit
    # status so shells and CI can tell that file generation did not finish.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nOperation cancelled by user")
        # 130 = 128 + SIGINT, the conventional status for a Ctrl-C abort
        raise SystemExit(130)
    except Exception as e:
        print(f"\n\nError: {e}")
        import traceback

        traceback.print_exc()
        # Report the failure to the caller instead of exiting with status 0
        raise SystemExit(1)
|