Merge pull request #933 from harvard-edge/fix/lowercase-filenames

feat(ci): add workflow to fix file casing
This commit is contained in:
Vijay Janapa Reddi
2025-08-12 16:06:38 -04:00
committed by GitHub
6 changed files with 255 additions and 22 deletions

.github/workflows/fix_casing.yml
View File

@@ -0,0 +1,41 @@
name: Fix File Casing

on:
  workflow_dispatch:

jobs:
  fix_casing:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'

      - name: Generate renaming script
        id: generate_script
        run: |
          python3 tools/scripts/maintenance/generate_casing_fix_script.py

      - name: Execute renaming script
        run: |
          if [ -f fix_casing.sh ]; then
            bash fix_casing.sh
          else
            echo "No casing issues found."
          fi

      - name: Commit and push changes
        run: |
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git add .
          if ! git diff --staged --quiet; then
            git commit -m "fix(images): Correct file casing for image files"
            git push
          else
            echo "No changes to commit."
          fi

View File

@@ -53,6 +53,7 @@ repos:
name: "Collapse extra blank lines"
entry: python tools/scripts/content/format_blank_lines.py
language: python
additional_dependencies: []
pass_filenames: true
files: ^quarto/contents/.*\.qmd$
@@ -60,13 +61,6 @@ repos:
  # PHASE 2: BASIC VALIDATORS (Structure and syntax)
  # =============================================================================
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: check-yaml
        name: "Validate _quarto.yml"
        files: ^_quarto\.yml$
  - repo: local
    hooks:
      # --- Project Structure Check ---
@@ -97,15 +91,6 @@ repos:
          )$
        description: "Validate all YAML files with custom config"
      # --- GitHub Workflow Validation ---
      - id: validate-github-workflows
        name: "Validate GitHub workflow syntax"
        entry: yamllint
        language: system
        args: [--config-file=.yamllint]
        files: ^\.github/workflows/.*\.(yml|yaml)$
        description: "Validate GitHub workflow YAML syntax and structure"
  # =============================================================================
  # PHASE 3: CONTENT VALIDATORS (After formatting is complete)
  # =============================================================================
@@ -126,14 +111,16 @@ repos:
name: "Check for unreferenced labels"
entry: python ./tools/scripts/content/check_unreferenced_labels.py ./quarto/contents/core
language: python
additional_dependencies: []
pass_filenames: false
files: ''
- id: check-duplicate-labels
name: "Check for duplicate labels"
entry: python3 tools/scripts/content/check_duplicate_labels.py
entry: python tools/scripts/content/check_duplicate_labels.py
args: ['-d', 'quarto/contents/', '--figures', '--tables', '--listings', '--quiet', '--strict']
language: python
additional_dependencies: []
pass_filenames: false
files: ^quarto/contents/.*\.qmd$
description: "Ensure all figure, table, and listing labels are unique across the book"
@@ -190,8 +177,9 @@ repos:
      - id: validate-image-references
        name: "Check that all image references exist on disk"
        entry: python3 tools/scripts/validate_image_references.py -d quarto/contents/ --quiet
        entry: python tools/scripts/validate_image_references.py -d quarto/contents/ --quiet
        language: python
        additional_dependencies: []
        pass_filenames: false
        files: ^quarto/contents/.*\.qmd$
        description: "Ensure all referenced images exist on disk"
@@ -205,8 +193,9 @@ repos:
      # --- Auto-cleanup with Book Binder ---
      - id: auto-cleanup-artifacts
        name: "Auto-cleanup build artifacts (Book Binder)"
        entry: ./binder clean
        language: system
        entry: python tools/scripts/maintenance/cleanup_build_artifacts.py
        language: python
        additional_dependencies: [rich]
        pass_filenames: false
        files: ''
        stages: [pre-commit]

View File

@@ -490,12 +490,12 @@ Masking involves altering or obfuscating sensitive values so that they cannot be
Generalization reduces the precision or granularity of data to decrease the likelihood of re-identification. Instead of revealing an exact date of birth or address, the data is aggregated into broader categories (e.g., age ranges, zip code prefixes). For example, a user's exact age of 37 might be generalized to an age range of 30-39, while their exact address might be bucketed at a city-level granularity. This technique clearly reduces the risk of identifying an individual by sharing data in aggregated form; however, we may consequently lose analytical precision. Furthermore, if the granularity is not chosen carefully, individuals may still be identifiable under certain conditions.
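To make the idea concrete, here is a minimal sketch (not from the original text; the helper name and bucket boundaries are hypothetical) of how exact values can be generalized into coarser buckets:

# Hypothetical sketch: generalize an exact record into coarser buckets.
def generalize_record(record: dict) -> dict:
    decade = (record["age"] // 10) * 10
    return {
        "age_range": f"{decade}-{decade + 9}",    # 37 -> "30-39"
        "zip_prefix": record["zip"][:3] + "**",   # "02138" -> "021**"
    }

print(generalize_record({"age": 37, "zip": "02138"}))
# {'age_range': '30-39', 'zip_prefix': '021**'}

Records that were unique at full precision become indistinguishable within their bucket, which is exactly the privacy/utility trade-off described above.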
Pseudonymization is the process of replacing direct identifiers (like names, Social Security numbers, or email addresses) with artificial identifiers, or pseudonyms." These pseudonyms must not reveal, or be easily traceable to, the original data subject. This is commonly used in health records or in any situation where datasets need personal identities removed, but maintain unique entries. This approach allow maintaining individual-level data for analysis (since records can be traced through pseudonyms), while reducing the risk of direct identification. However, if the "key" linking the pseudonym to the real identifier is compromised, re-identification becomes possible.
Pseudonymization is the process of replacing direct identifiers (like names, Social Security numbers, or email addresses) with artificial identifiers, or "pseudonyms." These pseudonyms must not reveal, or be easily traceable to, the original data subject. This is commonly used in health records or in any situation where a dataset needs personal identities removed while preserving unique entries. This approach allows maintaining individual-level data for analysis (since records can be traced through pseudonyms), while reducing the risk of direct identification. However, if the "key" linking the pseudonym to the real identifier is compromised, re-identification becomes possible.
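A minimal sketch of this idea, assuming a simple in-memory lookup table (the function name and pseudonym format are illustrative, not from the original text):

# Hypothetical sketch: replace direct identifiers with stable pseudonyms.
import secrets

pseudonym_key = {}  # the sensitive "key" linking pseudonyms back to real identifiers

def pseudonymize(identifier: str) -> str:
    # Reuse the same pseudonym for repeated appearances so records stay linkable.
    if identifier not in pseudonym_key:
        pseudonym_key[identifier] = f"P-{secrets.token_hex(4)}"
    return pseudonym_key[identifier]

print(pseudonymize("jane.doe@example.com"))  # e.g. 'P-1f3a9c02'
print(pseudonymize("jane.doe@example.com"))  # same pseudonym, so individual-level analysis still works

If the lookup table leaks, every pseudonym can be reversed, which is precisely the compromise risk noted above.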
$k$-anonymity ensures that each record in a dataset is indistinguishable from at least $k-1$ other records. This is achieved by suppressing or generalizing quasi-identifiers, attributes that, in combination, could be used to re-identify an individual (e.g., zip code, age, gender). For example, if $k=5$, every record in the dataset must share the same combination of quasi-identifiers with at least four other records. Thus, an attacker cannot pinpoint a single individual simply by looking at these attributes. This approach provides a formal privacy guarantee that helps reduce the chances of individual re-identification. However, it is labor-intensive, may require a significant degree of data distortion, and does not protect against [homogeneity or background knowledge attacks](https://en.wikipedia.org/wiki/K-anonymity#Attacks).
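As a quick illustration (a sketch under assumed quasi-identifiers; the data and helper are hypothetical), $k$-anonymity can be checked by counting how many records share each quasi-identifier combination:

# Hypothetical sketch: verify k-anonymity for a chosen set of quasi-identifiers.
from collections import Counter

def is_k_anonymous(records, quasi_identifiers, k):
    # Group records by their quasi-identifier values and require every group to have at least k members.
    groups = Counter(tuple(r[q] for q in quasi_identifiers) for r in records)
    return all(count >= k for count in groups.values())

rows = [
    {"zip": "021**", "age": "30-39", "gender": "F"},
    {"zip": "021**", "age": "30-39", "gender": "F"},
    {"zip": "021**", "age": "30-39", "gender": "F"},
]
print(is_k_anonymous(rows, ["zip", "age", "gender"], k=3))  # True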
Differential privacy (DP) adds carefully [calibrated noise" or randomized data perturbations](https://digitalprivacy.ieee.org/publications/topics/what-is-differential-privacy#:~:text=At%20its%20roots%2C%20differential%20privacy,a%20result%20of%20providing%20data.) to query results or datasets. The goal is to ensure that the inclusion or exclusion of any single individuals data does not significantly affect the output, thereby concealing their presence. Introduced noise is controlled by the $\epsilon$ parameter in $\epsilon$-Differential Privacy, balancing data utility and privacy guarantees. The clear advantages this approach provides are strong mathematical guarantees of privacy, and DP is widely used in academic and industrial settings (e.g., large-scale data analysis). However, the added noise can affect data accuracy and subsequent model performance; proper parameter tuning is crucial to ensure both privacy and usefulness.
Differential privacy (DP) adds carefully [calibrated "noise" or randomized data perturbations](https://digitalprivacy.ieee.org/publications/topics/what-is-differential-privacy#:~:text=At%20its%20roots%2C%20differential%20privacy,a%20result%20of%20providing%20data.) to query results or datasets. The goal is to ensure that the inclusion or exclusion of any single individual's data does not significantly affect the output, thereby concealing their presence. Introduced noise is controlled by the $\epsilon$ parameter in $\epsilon$-Differential Privacy, balancing data utility and privacy guarantees. The clear advantages this approach provides are strong mathematical guarantees of privacy, and DP is widely used in academic and industrial settings (e.g., large-scale data analysis). However, the added noise can affect data accuracy and subsequent model performance; proper parameter tuning is crucial to ensure both privacy and usefulness.
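For intuition, here is a simplified sketch of the standard Laplace mechanism (the query, sensitivity, and $\epsilon$ values are illustrative, not from the original text):

# Hypothetical sketch: an epsilon-differentially-private count via the Laplace mechanism.
import numpy as np

def dp_count(true_count: int, sensitivity: float = 1.0, epsilon: float = 0.5) -> float:
    # Add Laplace noise with scale = sensitivity / epsilon; a smaller epsilon means more noise.
    return true_count + np.random.laplace(loc=0.0, scale=sensitivity / epsilon)

print(dp_count(128))  # e.g. 126.3; any single person's contribution is masked by the calibrated noise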
In summary, effective data anonymization is a balancing act between privacy and utility. Techniques such as masking, generalization, pseudonymization, k-anonymity, and differential privacy each target different aspects of re-identification risk. By carefully selecting and combining these methods, organizations can responsibly derive value from sensitive datasets while respecting the privacy rights and expectations of the individuals represented within them.

View File

@@ -0,0 +1,98 @@
import argparse
import shutil
from pathlib import Path

from rich.console import Console


def clean_artifacts(book_dir_str: str, dry_run: bool = False):
    """
    Clean build artifacts and restore configs.
    """
    console = Console()
    book_dir = Path(book_dir_str)
    console.print("[bold blue]🧹 Build Artifact Cleanup[/bold blue]")

    # Restore configs
    for config_ext in ["_quarto-html.yml", "_quarto-pdf.yml"]:
        config_file = book_dir / "config" / config_ext
        backup_file = config_file.with_suffix(f"{config_file.suffix}.fast-build-backup")
        if backup_file.exists():
            if not dry_run:
                shutil.copy(backup_file, config_file)
                backup_file.unlink()
            console.print(f"[green]  ✅ Restored: {config_file.name}[/green]")
        else:
            console.print(f"[dim]  - Already clean: {config_file.name}[/dim]")

    # Define artifacts to clean
    artifacts_to_clean = [
        (book_dir / "_build", "Build directory (all formats)"),
        (book_dir / "index_files", "Book index files"),
        (book_dir / ".quarto", "Quarto cache (book)"),
    ]

    # Clean Quarto-generated figure directories
    contents_core = book_dir / "contents" / "core"
    if contents_core.exists():
        for chapter_dir in contents_core.glob("*/"):
            if chapter_dir.is_dir():
                for files_dir in chapter_dir.glob("*_files"):
                    if files_dir.is_dir():
                        figure_html_dir = files_dir / "figure-html"
                        if figure_html_dir.exists():
                            artifacts_to_clean.append(
                                (
                                    figure_html_dir,
                                    f"Quarto figure artifacts ({chapter_dir.name})",
                                )
                            )
                figure_html_direct = chapter_dir / "figure-html"
                if figure_html_direct.exists():
                    artifacts_to_clean.append(
                        (
                            figure_html_direct,
                            f"Quarto figure artifacts ({chapter_dir.name})",
                        )
                    )

    cleaned_count = 0
    for artifact_path, description in artifacts_to_clean:
        if artifact_path.exists():
            console.print(
                f"[yellow]  🗑️ Removing: {artifact_path.name} ({description})[/yellow]"
            )
            if not dry_run:
                if artifact_path.is_dir():
                    shutil.rmtree(artifact_path)
                else:
                    artifact_path.unlink()
            cleaned_count += 1

    if cleaned_count > 0:
        console.print(f"[green]  ✅ Cleaned {cleaned_count} items successfully[/green]")
    else:
        console.print("[green]  ✅ No artifacts to clean[/green]")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Clean build artifacts from the Quarto project."
    )
    parser.add_argument(
        "--book-dir",
        type=str,
        default="quarto",
        help="Path to the book directory (default: 'quarto').",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be cleaned without actually deleting anything.",
    )
    args = parser.parse_args()
    clean_artifacts(book_dir_str=args.book_dir, dry_run=args.dry_run)

View File

@@ -0,0 +1,42 @@
import os
import subprocess


def get_git_files():
    """Get a list of all files tracked by Git."""
    result = subprocess.run(['git', 'ls-files'], capture_output=True, text=True)
    return result.stdout.splitlines()


def generate_rename_script(output_script_path):
    """
    Generates a shell script to rename files with uppercase characters to lowercase.
    """
    git_files = get_git_files()
    image_extensions = {".png", ".jpg", ".jpeg", ".gif"}
    commands = []

    for file_path in git_files:
        directory, filename = os.path.split(file_path)
        # Check if the filename has any uppercase characters and is an image
        if any(char.isupper() for char in filename) and os.path.splitext(filename)[1].lower() in image_extensions:
            lowercase_filename = filename.lower()
            if filename != lowercase_filename:
                new_path = os.path.join(directory, lowercase_filename)
                # Use a temporary name to handle systems that are case-insensitive
                temp_path = os.path.join(directory, f"temp_{lowercase_filename}")
                commands.append(f'git mv -f "{file_path}" "{temp_path}"')
                commands.append(f'git mv -f "{temp_path}" "{new_path}"')

    if commands:
        with open(output_script_path, "w") as f:
            f.write("#!/bin/bash\n")
            f.write("# This script renames image files to force Git to recognize case changes.\n\n")
            f.write("\n".join(commands))
            f.write("\n")
        print(f"Generated rename script with {len(commands)//2} files to rename at: {output_script_path}")
    else:
        print("No image files with uppercase characters found to rename.")


if __name__ == "__main__":
    generate_rename_script("fix_casing.sh")

View File

@@ -0,0 +1,63 @@
import subprocess
import sys

from rich.console import Console


def run_pre_commit():
    """
    Run all pre-commit hooks and report the status.
    """
    console = Console()
    console.print("[bold blue]🚀 Running pre-commit checks...[/bold blue]")
    try:
        process = subprocess.run(
            ["pre-commit", "run", "--all-files"],
            capture_output=True,
            text=True,
            check=False,  # Don't raise exception on non-zero exit
        )
        if process.returncode == 0:
            console.print("[bold green]✅ Pre-commit checks passed successfully![/bold green]")
            return True
        else:
            console.print("[bold red]❌ Pre-commit checks failed.[/bold red]")
            console.print("\n[yellow]Output:[/yellow]")
            console.print(process.stdout)
            console.print(process.stderr)
            return False
    except FileNotFoundError:
        console.print("[bold red]Error: 'pre-commit' command not found.[/bold red]")
        console.print("Please ensure pre-commit is installed and in your PATH.")
        return False
    except Exception as e:
        console.print(f"[bold red]An unexpected error occurred: {e}[/bold red]")
        return False


def main():
    """
    Main function to run all pre-flight checks.
    """
    console = Console()
    console.print("[bold magenta]✈️ Starting pre-flight checks...[/bold magenta]\n")

    pre_commit_ok = run_pre_commit()
    if not pre_commit_ok:
        sys.exit(1)

    # Future checks can be added here
    # For example:
    # slow_tests_ok = run_slow_tests()
    # if not slow_tests_ok:
    #     sys.exit(1)

    console.print("\n[bold green]🎉 All pre-flight checks passed![/bold green]")
    sys.exit(0)


if __name__ == "__main__":
    main()