mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-08 23:03:55 -05:00
refactor(milestones): use stdlib tar filter='data' for safe CIFAR-10 extraction
This commit is contained in:
@@ -207,23 +207,10 @@ class DatasetManager:
|
||||
url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
|
||||
self.download_with_progress(url, data_file)
|
||||
|
||||
# Extract
|
||||
# Extract (filter='data' prevents path traversal; needs Python 3.12+ or an older security release with the tarfile filter backport)
|
||||
print("📦 Extracting CIFAR-10...")
|
||||
with tarfile.open(data_file, 'r:gz') as tar:
|
||||
def is_within_directory(directory, target):
|
||||
abs_directory = os.path.abspath(directory)
|
||||
abs_target = os.path.abspath(target)
|
||||
prefix = os.path.commonprefix([abs_directory, abs_target])
|
||||
return prefix == abs_directory
|
||||
|
||||
def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
|
||||
for member in tar.getmembers():
|
||||
member_path = os.path.join(path, member.name)
|
||||
if not is_within_directory(path, member_path):
|
||||
raise Exception("Attempted Path Traversal in Tar File")
|
||||
tar.extractall(path, members, numeric_owner=numeric_owner)
|
||||
|
||||
safe_extract(tar, cifar_dir)
|
||||
tar.extractall(cifar_dir, filter='data')
|
||||
print("✅ Extraction complete!")
|
||||
else:
|
||||
print(f"✅ CIFAR-10 already downloaded at {cifar_dir}")
|
||||
|
||||
Reference in New Issue
Block a user