refactor(milestones): use stdlib tar filter='data' for safe CIFAR-10 extraction

This commit is contained in:
Vijay Janapa Reddi
2026-02-22 13:40:09 -05:00
parent 3a149a06a6
commit ab697a84f2

View File

@@ -207,23 +207,10 @@ class DatasetManager:
url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
self.download_with_progress(url, data_file)
# Extract
# Extract (filter='data' rejects path traversal; needs Python 3.12+ or a security release that backports tarfile extraction filters)
print("📦 Extracting CIFAR-10...")
with tarfile.open(data_file, 'r:gz') as tar:
def is_within_directory(directory, target):
abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)
prefix = os.path.commonprefix([abs_directory, abs_target])
return prefix == abs_directory
def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise Exception("Attempted Path Traversal in Tar File")
tar.extractall(path, members, numeric_owner=numeric_owner)
safe_extract(tar, cifar_dir)
tar.extractall(cifar_dir, filter='data')
print("✅ Extraction complete!")
else:
print(f"✅ CIFAR-10 already downloaded at {cifar_dir}")