From ab697a84f2794898af0612ba4b375ca737ea1533 Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Sun, 22 Feb 2026 13:40:09 -0500 Subject: [PATCH] refactor(milestones): use stdlib tar filter='data' for safe CIFAR-10 extraction --- tinytorch/milestones/data_manager.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/tinytorch/milestones/data_manager.py b/tinytorch/milestones/data_manager.py index 4aa71a9fc..22f341b87 100644 --- a/tinytorch/milestones/data_manager.py +++ b/tinytorch/milestones/data_manager.py @@ -207,23 +207,10 @@ class DatasetManager: url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" self.download_with_progress(url, data_file) - # Extract + # Extract (filter='data' prevents path traversal; Python 3.12+) print("📦 Extracting CIFAR-10...") with tarfile.open(data_file, 'r:gz') as tar: - def is_within_directory(directory, target): - abs_directory = os.path.abspath(directory) - abs_target = os.path.abspath(target) - prefix = os.path.commonprefix([abs_directory, abs_target]) - return prefix == abs_directory - - def safe_extract(tar, path=".", members=None, *, numeric_owner=False): - for member in tar.getmembers(): - member_path = os.path.join(path, member.name) - if not is_within_directory(path, member_path): - raise Exception("Attempted Path Traversal in Tar File") - tar.extractall(path, members, numeric_owner=numeric_owner) - - safe_extract(tar, cifar_dir) + tar.extractall(cifar_dir, filter='data') print("✅ Extraction complete!") else: print(f"✅ CIFAR-10 already downloaded at {cifar_dir}")