# Mirrored from https://github.com/KohakuBlueleaf/KohakuHub.git (scripts/show_s3_usage.py)
"""Show S3 storage usage.
|
|
|
|
This script displays storage statistics without making any changes.
|
|
Useful for monitoring storage usage on limited plans (e.g., CloudFlare R2 free tier).
|
|
|
|
Usage:
|
|
python scripts/show_s3_usage.py \\
|
|
--endpoint https://s3.amazonaws.com \\
|
|
--access-key YOUR_KEY \\
|
|
--secret-key YOUR_SECRET \\
|
|
--bucket my-bucket
|
|
|
|
python scripts/show_s3_usage.py \\
|
|
--endpoint https://s3.amazonaws.com \\
|
|
--access-key YOUR_KEY \\
|
|
--secret-key YOUR_SECRET \\
|
|
--bucket my-bucket \\
|
|
--detailed
|
|
|
|
Requirements:
|
|
- boto3 and rich packages
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from collections import defaultdict
|
|
|
|
import boto3
|
|
from botocore.exceptions import ClientError
|
|
from rich.console import Console
|
|
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
from rich.table import Table
|
|
from rich.tree import Tree
|
|
|
|
console = Console()
|
|
|
|
|
|
def get_s3_client(endpoint, access_key, secret_key, region="us-east-1"):
    """Build a boto3 S3 client for the given endpoint and credentials.

    Args:
        endpoint: S3 endpoint URL.
        access_key: S3 access key ID.
        secret_key: S3 secret access key.
        region: S3 region name (default: us-east-1).

    Returns:
        A configured boto3 S3 client.
    """
    # SigV4 signing plus path-style addressing keeps the client compatible
    # with S3-like backends (e.g. MinIO, R2) that may not support
    # virtual-hosted bucket URLs.
    client_config = boto3.session.Config(
        signature_version="s3v4",
        s3={"addressing_style": "path"},
    )
    return boto3.client(
        "s3",
        endpoint_url=endpoint,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        region_name=region,
        config=client_config,
    )
|
|
|
|
|
|
def format_size(bytes_size):
    """Return *bytes_size* rendered as a human-readable string (e.g. ``1.50 MB``)."""
    size = float(bytes_size)
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    # Anything at or beyond 1024 TB is reported in petabytes.
    return f"{size:.2f} PB"
|
|
|
|
|
|
def analyze_storage(s3_client, bucket):
    """Scan every object in *bucket* and aggregate usage per key prefix.

    Objects are bucketed into ``lfs``, ``models``, ``datasets``, ``spaces``
    or ``other``; a synthetic ``total`` entry accumulates across all of them.

    Returns:
        dict: per-category stats (``count``, ``size``, ``objects``),
        or ``None`` when the bucket does not exist.
    """
    stats = defaultdict(lambda: {"count": 0, "size": 0, "objects": []})

    # Key-prefix -> category lookup; first match wins, fallback is "other".
    prefix_map = (
        ("lfs/", "lfs"),
        ("hf-model-", "models"),
        ("hf-dataset-", "datasets"),
        ("hf-space-", "spaces"),
    )

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Analyzing storage...", total=None)

        try:
            pages = s3_client.get_paginator("list_objects_v2").paginate(Bucket=bucket)
            for page in pages:
                # Pages without "Contents" (empty bucket) contribute nothing.
                for obj in page.get("Contents", []):
                    key = obj["Key"]
                    size = obj["Size"]

                    category = "other"
                    for key_prefix, name in prefix_map:
                        if key.startswith(key_prefix):
                            category = name
                            break

                    entry = stats[category]
                    entry["count"] += 1
                    entry["size"] += size
                    entry["objects"].append({"key": key, "size": size})

                    stats["total"]["count"] += 1
                    stats["total"]["size"] += size

        except ClientError as e:
            # A missing bucket is reported gracefully; anything else bubbles up.
            if e.response["Error"]["Code"] == "NoSuchBucket":
                console.print(f"[red]Error: Bucket '{bucket}' does not exist[/red]")
                return None
            raise

    return stats
|
|
|
|
|
|
def display_summary(bucket, stats, detailed=False):
    """Render a storage summary table, plus an optional per-category tree.

    Args:
        bucket: Bucket name (used in the table title).
        stats: Statistics dict produced by :func:`analyze_storage`.
        detailed: When True, also show the ten largest objects per category.
    """
    categories = [
        ("lfs", "LFS Files (>5MB)"),
        ("models", "Model Repositories"),
        ("datasets", "Dataset Repositories"),
        ("spaces", "Space Repositories"),
        ("other", "Other"),
    ]

    table = Table(
        title=f"S3 Storage Usage: {bucket}", show_header=True, header_style="bold cyan"
    )
    table.add_column("Category", style="yellow")
    table.add_column("Objects", justify="right", style="magenta")
    table.add_column("Total Size", justify="right", style="green")
    table.add_column("Percentage", justify="right", style="cyan")

    total_size = stats["total"]["size"]

    for prefix, label in categories:
        # Skip categories with no objects so the table stays compact.
        if prefix not in stats or stats[prefix]["count"] == 0:
            continue
        count = stats[prefix]["count"]
        size = stats[prefix]["size"]
        # Guard against division by zero on a degenerate (empty) total.
        percentage = (size / total_size * 100) if total_size > 0 else 0
        table.add_row(label, f"{count:,}", format_size(size), f"{percentage:.1f}%")

    table.add_row(
        "[bold]TOTAL[/bold]",
        f"[bold]{stats['total']['count']:,}[/bold]",
        f"[bold]{format_size(stats['total']['size'])}[/bold]",
        "[bold]100.0%[/bold]",
        style="bold blue",
    )

    console.print(table)

    # Detailed tree view only when requested and there is something to show.
    if not detailed or stats["total"]["count"] == 0:
        return

    console.print()
    tree = Tree("[bold cyan]Storage Breakdown[/bold cyan]")

    for prefix, label in categories:
        if prefix not in stats or stats[prefix]["count"] == 0:
            continue
        branch = tree.add(
            f"[yellow]{label}[/yellow] - {format_size(stats[prefix]['size'])}"
        )

        # List only the ten largest objects in this category.
        all_objects = stats[prefix]["objects"]
        largest = sorted(all_objects, key=lambda o: o["size"], reverse=True)[:10]
        for obj in largest:
            branch.add(f"{obj['key']} - [green]{format_size(obj['size'])}[/green]")

        hidden = len(all_objects) - 10
        if hidden > 0:
            branch.add(f"[dim]... and {hidden} more[/dim]")

    console.print(tree)
|
|
|
|
|
|
def display_quota_warning(total_size, quota_gb=10):
    """Print a colour-coded quota status line based on usage percentage.

    Args:
        total_size: Total bytes currently stored.
        quota_gb: Quota ceiling in GB (default 10, the CloudFlare R2 free tier).
    """
    quota_bytes = quota_gb * 1024**3
    percentage = 100.0 * total_size / quota_bytes if quota_bytes > 0 else 0

    console.print()
    if percentage >= 90:
        # Critical: point the operator at the companion cleanup script.
        console.print(
            f"[bold red]⚠ WARNING: Using {percentage:.1f}% of {quota_gb}GB quota![/bold red]"
        )
        console.print(
            "[red]Consider running: python scripts/clear_s3_storage.py --prefix lfs/[/red]"
        )
    elif percentage >= 75:
        console.print(
            f"[bold yellow]⚠ Approaching quota limit: {percentage:.1f}% of {quota_gb}GB used[/bold yellow]"
        )
    else:
        console.print(
            f"[green]✓ Storage usage: {percentage:.1f}% of {quota_gb}GB quota[/green]"
        )
|
|
|
|
|
|
def main():
    """Parse CLI arguments, analyze the bucket, and display or print usage.

    Exits with status 1 when a required argument is missing, the S3 client
    cannot be created, or the bucket does not exist.
    """
    parser = argparse.ArgumentParser(description="Show S3 storage usage")

    # S3 connection arguments (each falls back to an environment variable).
    parser.add_argument(
        "--endpoint",
        default=os.environ.get("S3_ENDPOINT"),
        help="S3 endpoint URL (or set S3_ENDPOINT env var)",
    )
    parser.add_argument(
        "--access-key",
        default=os.environ.get("S3_ACCESS_KEY"),
        help="S3 access key (or set S3_ACCESS_KEY env var)",
    )
    parser.add_argument(
        "--secret-key",
        default=os.environ.get("S3_SECRET_KEY"),
        help="S3 secret key (or set S3_SECRET_KEY env var)",
    )
    parser.add_argument(
        "--bucket",
        default=os.environ.get("S3_BUCKET"),
        help="S3 bucket name (or set S3_BUCKET env var)",
    )
    parser.add_argument(
        "--region",
        default=os.environ.get("S3_REGION", "us-east-1"),
        help="S3 region (default: us-east-1, or set S3_REGION env var)",
    )

    # Display options
    parser.add_argument(
        "--detailed",
        action="store_true",
        help="Show detailed breakdown with top objects",
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument(
        "--quota-gb",
        type=int,
        default=10,
        help="Storage quota in GB for warning calculation (default: 10GB for R2 free tier)",
    )

    args = parser.parse_args()

    # Validate required arguments in one loop instead of four copy-pasted
    # blocks; error messages are identical to the previous per-flag checks.
    required = [
        ("endpoint", "--endpoint", "S3_ENDPOINT"),
        ("access_key", "--access-key", "S3_ACCESS_KEY"),
        ("secret_key", "--secret-key", "S3_SECRET_KEY"),
        ("bucket", "--bucket", "S3_BUCKET"),
    ]
    for attr, flag, env_var in required:
        if not getattr(args, attr):
            console.print(
                f"[red]Error: {flag} is required (or set {env_var} env var)[/red]"
            )
            sys.exit(1)

    bucket = args.bucket

    # Create S3 client
    try:
        s3_client = get_s3_client(
            endpoint=args.endpoint,
            access_key=args.access_key,
            secret_key=args.secret_key,
            region=args.region,
        )
    except Exception as e:
        console.print(f"[red]Error connecting to S3: {e}[/red]")
        sys.exit(1)

    # Analyze storage; None means the bucket does not exist.
    stats = analyze_storage(s3_client, bucket)

    if stats is None:
        sys.exit(1)

    if stats["total"]["count"] == 0:
        console.print("[yellow]Bucket is empty.[/yellow]")
        return

    # Output format
    if args.json:
        # Machine-readable output: totals plus per-category counts/sizes.
        output = {
            "bucket": bucket,
            "total_objects": stats["total"]["count"],
            "total_size_bytes": stats["total"]["size"],
            "total_size_human": format_size(stats["total"]["size"]),
            "categories": {},
        }

        for prefix in ["lfs", "models", "datasets", "spaces", "other"]:
            if prefix in stats and stats[prefix]["count"] > 0:
                output["categories"][prefix] = {
                    "count": stats[prefix]["count"],
                    "size_bytes": stats[prefix]["size"],
                    "size_human": format_size(stats[prefix]["size"]),
                }

        print(json.dumps(output, indent=2))
    else:
        # Rich table output with a quota warning footer.
        console.print()
        display_summary(bucket, stats, detailed=args.detailed)
        display_quota_warning(stats["total"]["size"], quota_gb=args.quota_gb)
        console.print()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|