""" Input validation utilities for MLSysBook tools. This module provides comprehensive validation functions for common input types including file paths, configuration values, and data structures. """ import re import json import yaml from pathlib import Path from typing import Any, Dict, List, Optional, Union, Callable, Type from urllib.parse import urlparse from .exceptions import ValidationError def validate_file_path( path: Union[str, Path], must_exist: bool = True, must_be_file: bool = True, must_be_readable: bool = True, allowed_extensions: Optional[List[str]] = None ) -> Path: """Validate a file path. Args: path: File path to validate must_exist: Whether the file must exist must_be_file: Whether the path must be a file (not directory) must_be_readable: Whether the file must be readable allowed_extensions: List of allowed file extensions (e.g., ['.qmd', '.md']) Returns: Validated Path object Raises: ValidationError: If validation fails """ if not path: raise ValidationError("File path cannot be empty") path_obj = Path(path).resolve() # Check for path traversal attempts try: path_obj.resolve().relative_to(Path.cwd().resolve()) except ValueError: # Allow absolute paths, but check for suspicious patterns path_str = str(path_obj) if '..' in path_str or path_str.startswith('/'): # Additional validation for absolute paths pass if must_exist and not path_obj.exists(): raise ValidationError(f"File does not exist: {path_obj}") if must_exist and must_be_file and not path_obj.is_file(): raise ValidationError(f"Path is not a file: {path_obj}") if must_exist and must_be_readable: try: with open(path_obj, 'r', encoding='utf-8') as f: f.read(1) # Try to read one character except PermissionError: raise ValidationError(f"File is not readable: {path_obj}") except UnicodeDecodeError: raise ValidationError(f"File is not valid UTF-8: {path_obj}") if allowed_extensions: if path_obj.suffix.lower() not in [ext.lower() for ext in allowed_extensions]: raise ValidationError( f"File extension {path_obj.suffix} not allowed. " f"Allowed extensions: {allowed_extensions}" ) return path_obj def validate_directory_path( path: Union[str, Path], must_exist: bool = True, create_if_missing: bool = False, must_be_writable: bool = False ) -> Path: """Validate a directory path. Args: path: Directory path to validate must_exist: Whether the directory must exist create_if_missing: Whether to create the directory if it doesn't exist must_be_writable: Whether the directory must be writable Returns: Validated Path object Raises: ValidationError: If validation fails """ if not path: raise ValidationError("Directory path cannot be empty") path_obj = Path(path).resolve() if not path_obj.exists(): if create_if_missing: try: path_obj.mkdir(parents=True, exist_ok=True) except Exception as e: raise ValidationError(f"Cannot create directory {path_obj}: {e}") elif must_exist: raise ValidationError(f"Directory does not exist: {path_obj}") if path_obj.exists() and not path_obj.is_dir(): raise ValidationError(f"Path is not a directory: {path_obj}") if must_be_writable and path_obj.exists(): test_file = path_obj / '.write_test' try: test_file.touch() test_file.unlink() except Exception: raise ValidationError(f"Directory is not writable: {path_obj}") return path_obj def validate_url(url: str, allowed_schemes: Optional[List[str]] = None) -> str: """Validate a URL. Args: url: URL to validate allowed_schemes: List of allowed URL schemes (e.g., ['http', 'https']) Returns: Validated URL string Raises: ValidationError: If validation fails """ if not url: raise ValidationError("URL cannot be empty") if not isinstance(url, str): raise ValidationError("URL must be a string") try: parsed = urlparse(url) except Exception as e: raise ValidationError(f"Invalid URL format: {e}") if not parsed.scheme: raise ValidationError("URL must include a scheme (http, https, etc.)") if not parsed.netloc: raise ValidationError("URL must include a network location") if allowed_schemes and parsed.scheme not in allowed_schemes: raise ValidationError( f"URL scheme '{parsed.scheme}' not allowed. " f"Allowed schemes: {allowed_schemes}" ) return url def validate_json_data( data: Any, schema: Optional[Dict[str, Any]] = None, required_keys: Optional[List[str]] = None ) -> Any: """Validate JSON data structure. Args: data: Data to validate schema: Optional JSON schema for validation required_keys: Required keys for dictionary data Returns: Validated data Raises: ValidationError: If validation fails """ if schema: try: import jsonschema jsonschema.validate(data, schema) except ImportError: raise ValidationError("jsonschema package required for schema validation") except jsonschema.ValidationError as e: raise ValidationError(f"JSON schema validation failed: {e.message}") if required_keys and isinstance(data, dict): missing_keys = [key for key in required_keys if key not in data] if missing_keys: raise ValidationError(f"Missing required keys: {missing_keys}") return data def validate_string( value: Any, min_length: Optional[int] = None, max_length: Optional[int] = None, pattern: Optional[str] = None, allowed_values: Optional[List[str]] = None ) -> str: """Validate a string value. Args: value: Value to validate min_length: Minimum string length max_length: Maximum string length pattern: Regex pattern the string must match allowed_values: List of allowed string values Returns: Validated string Raises: ValidationError: If validation fails """ if not isinstance(value, str): raise ValidationError(f"Expected string, got {type(value).__name__}") if min_length is not None and len(value) < min_length: raise ValidationError(f"String too short. Minimum length: {min_length}") if max_length is not None and len(value) > max_length: raise ValidationError(f"String too long. Maximum length: {max_length}") if pattern and not re.match(pattern, value): raise ValidationError(f"String does not match pattern: {pattern}") if allowed_values and value not in allowed_values: raise ValidationError(f"Value '{value}' not in allowed values: {allowed_values}") return value def validate_number( value: Any, min_value: Optional[Union[int, float]] = None, max_value: Optional[Union[int, float]] = None, number_type: Type = float ) -> Union[int, float]: """Validate a numeric value. Args: value: Value to validate min_value: Minimum allowed value max_value: Maximum allowed value number_type: Expected number type (int or float) Returns: Validated number Raises: ValidationError: If validation fails """ try: if number_type == int: numeric_value = int(value) else: numeric_value = float(value) except (ValueError, TypeError): raise ValidationError(f"Cannot convert '{value}' to {number_type.__name__}") if min_value is not None and numeric_value < min_value: raise ValidationError(f"Value {numeric_value} below minimum: {min_value}") if max_value is not None and numeric_value > max_value: raise ValidationError(f"Value {numeric_value} above maximum: {max_value}") return numeric_value def validate_list( value: Any, item_validator: Optional[Callable] = None, min_items: Optional[int] = None, max_items: Optional[int] = None, unique_items: bool = False ) -> List[Any]: """Validate a list value. Args: value: Value to validate item_validator: Function to validate each item min_items: Minimum number of items max_items: Maximum number of items unique_items: Whether items must be unique Returns: Validated list Raises: ValidationError: If validation fails """ if not isinstance(value, list): raise ValidationError(f"Expected list, got {type(value).__name__}") if min_items is not None and len(value) < min_items: raise ValidationError(f"Too few items. Minimum: {min_items}") if max_items is not None and len(value) > max_items: raise ValidationError(f"Too many items. Maximum: {max_items}") if unique_items and len(value) != len(set(value)): raise ValidationError("List items must be unique") if item_validator: validated_items = [] for i, item in enumerate(value): try: validated_items.append(item_validator(item)) except ValidationError as e: raise ValidationError(f"Item {i} validation failed: {e}") return validated_items return value def validate_config_file(file_path: Union[str, Path]) -> Dict[str, Any]: """Validate and load a configuration file. Args: file_path: Path to configuration file Returns: Loaded configuration data Raises: ValidationError: If validation fails """ path_obj = validate_file_path( file_path, allowed_extensions=['.yaml', '.yml', '.json'] ) try: with open(path_obj, 'r', encoding='utf-8') as f: if path_obj.suffix.lower() == '.json': data = json.load(f) else: data = yaml.safe_load(f) except json.JSONDecodeError as e: raise ValidationError(f"Invalid JSON in config file: {e}") except yaml.YAMLError as e: raise ValidationError(f"Invalid YAML in config file: {e}") except Exception as e: raise ValidationError(f"Cannot read config file: {e}") if not isinstance(data, dict): raise ValidationError("Configuration file must contain a dictionary/object") return data class Validator: """Fluent validation interface for complex validation chains.""" def __init__(self, value: Any, name: str = "value") -> None: """Initialize validator with a value. Args: value: Value to validate name: Name of the value for error messages """ self.value = value self.name = name def is_string(self, **kwargs) -> 'Validator': """Validate that value is a string.""" self.value = validate_string(self.value, **kwargs) return self def is_number(self, **kwargs) -> 'Validator': """Validate that value is a number.""" self.value = validate_number(self.value, **kwargs) return self def is_list(self, **kwargs) -> 'Validator': """Validate that value is a list.""" self.value = validate_list(self.value, **kwargs) return self def is_file_path(self, **kwargs) -> 'Validator': """Validate that value is a file path.""" self.value = validate_file_path(self.value, **kwargs) return self def is_directory_path(self, **kwargs) -> 'Validator': """Validate that value is a directory path.""" self.value = validate_directory_path(self.value, **kwargs) return self def is_url(self, **kwargs) -> 'Validator': """Validate that value is a URL.""" self.value = validate_url(self.value, **kwargs) return self def get(self) -> Any: """Get the validated value.""" return self.value def validate(value: Any, name: str = "value") -> Validator: """Create a new validator for fluent validation. Args: value: Value to validate name: Name of the value for error messages Returns: Validator instance """ return Validator(value, name)