"""Show S3 storage usage. This script displays storage statistics without making any changes. Useful for monitoring storage usage on limited plans (e.g., CloudFlare R2 free tier). Usage: python scripts/show_s3_usage.py \\ --endpoint https://s3.amazonaws.com \\ --access-key YOUR_KEY \\ --secret-key YOUR_SECRET \\ --bucket my-bucket python scripts/show_s3_usage.py \\ --endpoint https://s3.amazonaws.com \\ --access-key YOUR_KEY \\ --secret-key YOUR_SECRET \\ --bucket my-bucket \\ --detailed Requirements: - boto3 and rich packages """ import argparse import json import os import sys from collections import defaultdict import boto3 from botocore.exceptions import ClientError from rich.console import Console from rich.progress import Progress, SpinnerColumn, TextColumn from rich.table import Table from rich.tree import Tree console = Console() def get_s3_client(endpoint, access_key, secret_key, region="us-east-1"): """Create S3 client with provided credentials. Args: endpoint: S3 endpoint URL access_key: S3 access key secret_key: S3 secret key region: S3 region (default: us-east-1) Returns: boto3 S3 client """ return boto3.client( "s3", endpoint_url=endpoint, aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region, config=boto3.session.Config( signature_version="s3v4", s3={"addressing_style": "path"}, ), ) def format_size(bytes_size): """Format bytes to human-readable size.""" for unit in ["B", "KB", "MB", "GB", "TB"]: if bytes_size < 1024.0: return f"{bytes_size:.2f} {unit}" bytes_size /= 1024.0 return f"{bytes_size:.2f} PB" def analyze_storage(s3_client, bucket): """Analyze storage usage by prefix. Returns: dict: Storage statistics by prefix """ stats = defaultdict(lambda: {"count": 0, "size": 0, "objects": []}) with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True, ) as progress: progress.add_task("Analyzing storage...", total=None) paginator = s3_client.get_paginator("list_objects_v2") try: for page in paginator.paginate(Bucket=bucket): if "Contents" not in page: continue for obj in page["Contents"]: key = obj["Key"] size = obj["Size"] # Categorize by prefix if key.startswith("lfs/"): prefix = "lfs" elif key.startswith("hf-model-"): prefix = "models" elif key.startswith("hf-dataset-"): prefix = "datasets" elif key.startswith("hf-space-"): prefix = "spaces" else: prefix = "other" stats[prefix]["count"] += 1 stats[prefix]["size"] += size stats[prefix]["objects"].append({"key": key, "size": size}) # Update total stats["total"]["count"] += 1 stats["total"]["size"] += size except ClientError as e: if e.response["Error"]["Code"] == "NoSuchBucket": console.print(f"[red]Error: Bucket '{bucket}' does not exist[/red]") return None raise return stats def display_summary(bucket, stats, detailed=False): """Display storage summary table.""" table = Table( title=f"S3 Storage Usage: {bucket}", show_header=True, header_style="bold cyan" ) table.add_column("Category", style="yellow") table.add_column("Objects", justify="right", style="magenta") table.add_column("Total Size", justify="right", style="green") table.add_column("Percentage", justify="right", style="cyan") total_size = stats["total"]["size"] # Sort by size (descending) categories = [ ("lfs", "LFS Files (>5MB)"), ("models", "Model Repositories"), ("datasets", "Dataset Repositories"), ("spaces", "Space Repositories"), ("other", "Other"), ] for prefix, label in categories: if prefix in stats and stats[prefix]["count"] > 0: count = stats[prefix]["count"] size = stats[prefix]["size"] percentage = (size / total_size * 100) if total_size > 0 else 0 table.add_row( label, f"{count:,}", format_size(size), f"{percentage:.1f}%", ) # Add total row table.add_row( "[bold]TOTAL[/bold]", f"[bold]{stats['total']['count']:,}[/bold]", f"[bold]{format_size(stats['total']['size'])}[/bold]", "[bold]100.0%[/bold]", style="bold blue", ) console.print(table) # Detailed view if detailed and stats["total"]["count"] > 0: console.print() tree = Tree(f"[bold cyan]Storage Breakdown[/bold cyan]") for prefix, label in categories: if prefix in stats and stats[prefix]["count"] > 0: branch = tree.add( f"[yellow]{label}[/yellow] - {format_size(stats[prefix]['size'])}" ) # Show top 10 largest objects in this category objects = sorted( stats[prefix]["objects"], key=lambda x: x["size"], reverse=True )[:10] for obj in objects: branch.add( f"{obj['key']} - [green]{format_size(obj['size'])}[/green]" ) if len(stats[prefix]["objects"]) > 10: branch.add( f"[dim]... and {len(stats[prefix]['objects']) - 10} more[/dim]" ) console.print(tree) def display_quota_warning(total_size, quota_gb=10): """Display warning if approaching quota limit.""" quota_bytes = quota_gb * 1000**3 percentage = (total_size / quota_bytes * 100) if quota_bytes > 0 else 0 console.print() if percentage >= 90: console.print( f"[bold red]⚠ WARNING: Using {percentage:.1f}% of {quota_gb}GB quota![/bold red]" ) console.print( f"[red]Consider running: python scripts/clear_s3_storage.py --prefix lfs/[/red]" ) elif percentage >= 75: console.print( f"[bold yellow]⚠ Approaching quota limit: {percentage:.1f}% of {quota_gb}GB used[/bold yellow]" ) else: console.print( f"[green]✓ Storage usage: {percentage:.1f}% of {quota_gb}GB quota[/green]" ) def main(): """Main entry point.""" parser = argparse.ArgumentParser(description="Show S3 storage usage") # S3 connection arguments parser.add_argument( "--endpoint", default=os.environ.get("S3_ENDPOINT"), help="S3 endpoint URL (or set S3_ENDPOINT env var)", ) parser.add_argument( "--access-key", default=os.environ.get("S3_ACCESS_KEY"), help="S3 access key (or set S3_ACCESS_KEY env var)", ) parser.add_argument( "--secret-key", default=os.environ.get("S3_SECRET_KEY"), help="S3 secret key (or set S3_SECRET_KEY env var)", ) parser.add_argument( "--bucket", default=os.environ.get("S3_BUCKET"), help="S3 bucket name (or set S3_BUCKET env var)", ) parser.add_argument( "--region", default=os.environ.get("S3_REGION", "us-east-1"), help="S3 region (default: us-east-1, or set S3_REGION env var)", ) # Display options parser.add_argument( "--detailed", action="store_true", help="Show detailed breakdown with top objects", ) parser.add_argument("--json", action="store_true", help="Output as JSON") parser.add_argument( "--quota-gb", type=int, default=10, help="Storage quota in GB for warning calculation (default: 10GB for R2 free tier)", ) args = parser.parse_args() # Validate required arguments if not args.endpoint: console.print( "[red]Error: --endpoint is required (or set S3_ENDPOINT env var)[/red]" ) sys.exit(1) if not args.access_key: console.print( "[red]Error: --access-key is required (or set S3_ACCESS_KEY env var)[/red]" ) sys.exit(1) if not args.secret_key: console.print( "[red]Error: --secret-key is required (or set S3_SECRET_KEY env var)[/red]" ) sys.exit(1) if not args.bucket: console.print( "[red]Error: --bucket is required (or set S3_BUCKET env var)[/red]" ) sys.exit(1) bucket = args.bucket # Create S3 client try: s3_client = get_s3_client( endpoint=args.endpoint, access_key=args.access_key, secret_key=args.secret_key, region=args.region, ) except Exception as e: console.print(f"[red]Error connecting to S3: {e}[/red]") sys.exit(1) # Analyze storage stats = analyze_storage(s3_client, bucket) if stats is None: sys.exit(1) if stats["total"]["count"] == 0: console.print("[yellow]Bucket is empty.[/yellow]") return # Output format if args.json: # JSON output output = { "bucket": bucket, "total_objects": stats["total"]["count"], "total_size_bytes": stats["total"]["size"], "total_size_human": format_size(stats["total"]["size"]), "categories": {}, } for prefix in ["lfs", "models", "datasets", "spaces", "other"]: if prefix in stats and stats[prefix]["count"] > 0: output["categories"][prefix] = { "count": stats[prefix]["count"], "size_bytes": stats[prefix]["size"], "size_human": format_size(stats[prefix]["size"]), } print(json.dumps(output, indent=2)) else: # Rich table output console.print() display_summary(bucket, stats, detailed=args.detailed) display_quota_warning(stats["total"]["size"], quota_gb=args.quota_gb) console.print() if __name__ == "__main__": main()