diff --git a/flowsint-api/app/api/routes/types.py b/flowsint-api/app/api/routes/types.py index 1ddb7db..5db1c6b 100644 --- a/flowsint-api/app/api/routes/types.py +++ b/flowsint-api/app/api/routes/types.py @@ -282,6 +282,8 @@ def extract_input_schema( "fields": [ resolve_field(prop, details=info, schema=schema) for prop, info in details.get("properties", {}).items() + # exclude label from properties to fill + if prop != "label" ], } diff --git a/flowsint-api/app/utils.py b/flowsint-api/app/utils.py index 1a0bd16..c9b1260 100644 --- a/flowsint-api/app/utils.py +++ b/flowsint-api/app/utils.py @@ -8,7 +8,6 @@ import re import ssl import socket from typing import Dict, Any, List, Type -from pydantic import BaseModel import inspect from typing import Any, Dict, Type from pydantic import BaseModel, TypeAdapter diff --git a/flowsint-app/src/components/graphs/details-panel/details-panel.tsx b/flowsint-app/src/components/graphs/details-panel/details-panel.tsx index b4dc910..b9c464b 100644 --- a/flowsint-app/src/components/graphs/details-panel/details-panel.tsx +++ b/flowsint-app/src/components/graphs/details-panel/details-panel.tsx @@ -113,7 +113,7 @@ function KeyValueDisplay({ data, className }: KeyValueDisplayProps) { {data && Object.entries(data) .filter( - ([key]) => !['sketch_id', 'caption', 'size', 'color', 'description', 'x', 'y'].includes(key) + ([key]) => !['id', 'sketch_id', 'caption', 'size', 'color', 'description', 'x', 'y'].includes(key) ) .map(([key, value], index) => { let val: string | null = null diff --git a/flowsint-core/src/flowsint_core/core/registry.py b/flowsint-core/src/flowsint_core/core/registry.py index 3198932..564169c 100644 --- a/flowsint-core/src/flowsint_core/core/registry.py +++ b/flowsint-core/src/flowsint_core/core/registry.py @@ -51,7 +51,6 @@ from flowsint_transforms.email.to_gravatar import EmailToGravatarTransform from flowsint_transforms.email.to_leaks import EmailToBreachesTransform # Phone-related transforms -from 
flowsint_transforms.phone.to_leaks import PhoneToBreachesTransform # Individual-related transforms from flowsint_transforms.individual.to_org import IndividualToOrgTransform @@ -192,9 +191,6 @@ TransformRegistry.register(EmailToGravatarTransform) TransformRegistry.register(EmailToBreachesTransform) TransformRegistry.register(EmailToDomainsTransform) -# Phone-related transforms -TransformRegistry.register(PhoneToBreachesTransform) - # Individual-related transforms TransformRegistry.register(IndividualToOrgTransform) TransformRegistry.register(IndividualToDomainsTransform) diff --git a/flowsint-core/src/flowsint_core/core/transform_base.py b/flowsint-core/src/flowsint_core/core/transform_base.py index 1dd3ebf..b09384c 100644 --- a/flowsint-core/src/flowsint_core/core/transform_base.py +++ b/flowsint-core/src/flowsint_core/core/transform_base.py @@ -132,7 +132,7 @@ class Transform(ABC): self._graph_service = create_graph_service( sketch_id=self.sketch_id, neo4j_connection=neo4j_conn, - enable_batching=True + enable_batching=True, ) # Params is filled synchronously by the constructor. This params is generally constructed of @@ -355,35 +355,49 @@ class Transform(ABC): def preprocess(self, values: List) -> List: """ Generic preprocess that validates and converts input using InputType. - Automatically handles str, dict, and object inputs, filtering invalids silently. - - Subclasses can override this method for custom preprocessing logic. + Automatically handles dicts, objects, and strings (using the model's primary field). + Invalid items are skipped silently. 
""" - # If InputType is not defined, return as-is (backward compatibility) if self.InputType is NotImplemented: return values from typing import get_args - # Extract base type from List[Type] type_args = get_args(self.InputType) if not type_args: - # If no type args, return as-is return values base_type = type_args[0] - - # Use TypeAdapter for validation and conversion adapter = TypeAdapter(base_type) + + # Trouver le champ primaire marqué par Field(..., primary=True) + primary_field = None + if issubclass(base_type, BaseModel): + for name, field in base_type.model_fields.items(): + if field.json_schema_extra and field.json_schema_extra.get("primary"): + primary_field = name + break + if primary_field is None: + # fallback : premier champ requis ou premier champ disponible + for name, field in base_type.model_fields.items(): + if field.is_required(): + primary_field = name + break + if primary_field is None: + primary_field = next(iter(base_type.model_fields.keys())) + cleaned = [] for item in values: try: - # TypeAdapter handles str, dict, and object automatically + # Si item est une string, transformer en dict {primary_field: string} + if isinstance(item, str) and primary_field: + item = {primary_field: item} + validated = adapter.validate_python(item) cleaned.append(validated) except Exception: - # Skip invalid items silently + # Ignore les items invalides continue return cleaned @@ -423,7 +437,7 @@ class Transform(ABC): return [] def create_node( - self, node_type: str, key_prop: str, key_value: str, **properties + self, node_type_or_obj, key_prop=None, key_value=None, **properties ) -> None: """ Create a single Neo4j node. 
@@ -434,24 +448,53 @@ class Transform(ABC): The following properties are automatically added to every node: - type: Lowercase version of node_type - sketch_id: Current sketch ID from transform context - - label: Defaults to key_value if not provided + - label: Automatically computed by FlowsintType, or defaults to key_value if not provided - created_at: ISO 8601 UTC timestamp (only on creation, not updates) + Best Practice - Use Pydantic object directly: + The simplest way is to pass a Pydantic object directly. The node type, + key property, and key value are automatically inferred: + + ```python + # Best: pass the Pydantic object directly + self.create_node(ip) + + # Also good if you need to override properties + self.create_node(domain, type="subdomain") + ``` + Args: - node_type: Node label (e.g., "domain", "ip") - key_prop: Property name used as unique identifier - key_value: Value of the key property - **properties: Additional node properties + node_type_or_obj: Either a Pydantic object (FlowsintType), or node label string (e.g., "domain", "ip") + key_prop: Property name used as unique identifier (optional if passing Pydantic object) + key_value: Value of the key property (optional if passing Pydantic object) + **properties: Additional node properties or property overrides Note: Uses MERGE semantics - if a node with the same (key_prop, sketch_id) exists, it will be updated. The created_at field is only set on creation. 
""" + # Check if first argument is a Pydantic object + if isinstance(node_type_or_obj, BaseModel): + obj = node_type_or_obj + + # Infer node_type from class name (e.g., Ip -> "ip", Domain -> "domain") + node_type = obj.__class__.__name__.lower() + + # Get the primary field and its value + primary_field = self._get_primary_field(obj) + key_prop = primary_field + key_value = getattr(obj, primary_field) + + # Merge object properties with any overrides + obj_properties = obj.__dict__.copy() + obj_properties.update(properties) + properties = obj_properties + else: + # Legacy signature: node_type_or_obj is the node_type string + node_type = node_type_or_obj + self._graph_service.create_node( - node_type=node_type, - key_prop=key_prop, - key_value=key_value, - **properties + node_type=node_type, key_prop=key_prop, key_value=key_value, **properties ) def _serialize_properties(self, properties: dict) -> dict: @@ -468,17 +511,18 @@ class Transform(ABC): Dictionary of serialized properties """ from .graph_serializer import GraphSerializer + return GraphSerializer.serialize_properties(properties) def create_relationship( self, - from_type: str, - from_key: str, - from_value: str, - to_type: str, - to_key: str, - to_value: str, - rel_type: str, + from_type_or_obj, + from_key_or_to_obj, + from_value_or_rel_type=None, + to_type=None, + to_key=None, + to_value=None, + rel_type=None, ) -> None: """ Create a relationship between two nodes. @@ -486,24 +530,95 @@ class Transform(ABC): This method now uses the GraphService for improved performance and better separation of concerns. 
+ Best Practice - Use Pydantic objects directly: + The simplest way is to pass two Pydantic objects and the relationship type: + + ```python + # Best: pass Pydantic objects directly + self.create_relationship(individual, domain, "HAS_DOMAIN") + self.create_relationship(email, breach, "FOUND_IN_BREACH") + ``` + + Legacy Usage: + You can still use the explicit signature for backward compatibility: + + ```python + # Legacy: explicit signature + self.create_relationship( + "individual", "full_name", individual.full_name, + "domain", "domain", domain_name, + "HAS_DOMAIN" + ) + ``` + Args: - from_type: Source node label - from_key: Source node key property - from_value: Source node key value - to_type: Target node label - to_key: Target node key property - to_value: Target node key value - rel_type: Relationship type + from_type_or_obj: Either a Pydantic object (source node) or source node label string + from_key_or_to_obj: Either a Pydantic object (target node) or source node key property + from_value_or_rel_type: Either relationship type string (if using objects) or source node key value + to_type: Target node label (only for legacy signature) + to_key: Target node key property (only for legacy signature) + to_value: Target node key value (only for legacy signature) + rel_type: Relationship type (only for legacy signature) """ - self._graph_service.create_relationship( - from_type=from_type, - from_key=from_key, - from_value=from_value, - to_type=to_type, - to_key=to_key, - to_value=to_value, - rel_type=rel_type - ) + # Check if using new signature (Pydantic objects) + if isinstance(from_type_or_obj, BaseModel) and isinstance(from_key_or_to_obj, BaseModel): + from_obj = from_type_or_obj + to_obj = from_key_or_to_obj + relationship_type = from_value_or_rel_type + + # Extract from_node info + from_node_type = from_obj.__class__.__name__.lower() + from_primary_field = self._get_primary_field(from_obj) + from_key_value = getattr(from_obj, from_primary_field) + + # Extract 
to_node info + to_node_type = to_obj.__class__.__name__.lower() + to_primary_field = self._get_primary_field(to_obj) + to_key_value = getattr(to_obj, to_primary_field) + + self._graph_service.create_relationship( + from_type=from_node_type, + from_key=from_primary_field, + from_value=from_key_value, + to_type=to_node_type, + to_key=to_primary_field, + to_value=to_key_value, + rel_type=relationship_type, + ) + else: + # Legacy signature + self._graph_service.create_relationship( + from_type=from_type_or_obj, + from_key=from_key_or_to_obj, + from_value=from_value_or_rel_type, + to_type=to_type, + to_key=to_key, + to_value=to_value, + rel_type=rel_type, + ) + + def _get_primary_field(self, obj: BaseModel) -> str: + """Helper method to get the primary field of a Pydantic object.""" + # Access model_fields from the class, not the instance + model_fields = obj.__class__.model_fields + + # Find the primary field (marked with json_schema_extra={"primary": True}) + primary_field = None + for field_name, field_info in model_fields.items(): + if field_info.json_schema_extra and field_info.json_schema_extra.get("primary"): + primary_field = field_name + break + + # Fallback: use first required field or first field + if primary_field is None: + for field_name, field_info in model_fields.items(): + if field_info.is_required(): + primary_field = field_name + break + if primary_field is None: + primary_field = next(iter(model_fields.keys())) + + return primary_field def log_graph_message(self, message: str) -> None: """ diff --git a/flowsint-core/src/flowsint_core/core/vault.py b/flowsint-core/src/flowsint_core/core/vault.py index 8cc6ef5..34138eb 100644 --- a/flowsint-core/src/flowsint_core/core/vault.py +++ b/flowsint-core/src/flowsint_core/core/vault.py @@ -1,4 +1,3 @@ -from ast import Str import os from typing import Protocol, Optional import uuid diff --git a/flowsint-core/tests/test_transform_base_simplified_api.py b/flowsint-core/tests/test_transform_base_simplified_api.py 
new file mode 100644 index 0000000..abcecbb --- /dev/null +++ b/flowsint-core/tests/test_transform_base_simplified_api.py @@ -0,0 +1,117 @@ +"""Test simplified API for create_node and create_relationship.""" +import pytest +from flowsint_core.core.transform_base import Transform +from flowsint_types.domain import Domain +from flowsint_types.email import Email +from flowsint_types.individual import Individual +from typing import List + + +class TestTransform(Transform): + """Simple transform for testing.""" + + InputType = List[Domain] + OutputType = List[Domain] + + @classmethod + def name(cls) -> str: + return "test_transform" + + @classmethod + def category(cls) -> str: + return "Test" + + @classmethod + def key(cls) -> str: + return "domain" + + async def scan(self, data: InputType) -> OutputType: + return data + + +def test_create_node_with_pydantic_object(): + """Test that create_node works with Pydantic objects.""" + transform = TestTransform(sketch_id="test", scan_id="test") + + # Create a domain object + domain = Domain(domain="example.com") + + # This should not raise an error + transform.create_node(domain) + + # Verify the helper method works + primary_field = transform._get_primary_field(domain) + assert primary_field == "domain" + + +def test_create_relationship_with_pydantic_objects(): + """Test that create_relationship works with Pydantic objects.""" + transform = TestTransform(sketch_id="test", scan_id="test") + + # Create objects + individual = Individual( + first_name="John", + last_name="Doe", + full_name="John Doe" + ) + domain = Domain(domain="example.com") + + # This should not raise an error + transform.create_relationship(individual, domain, "HAS_DOMAIN") + + +def test_create_node_legacy_signature(): + """Test that legacy create_node signature still works.""" + transform = TestTransform(sketch_id="test", scan_id="test") + + domain = Domain(domain="example.com") + + # Legacy signature should still work + transform.create_node("domain", 
"domain", "example.com", **domain.__dict__) + + +def test_create_relationship_legacy_signature(): + """Test that legacy create_relationship signature still works.""" + transform = TestTransform(sketch_id="test", scan_id="test") + + # Legacy signature should still work + transform.create_relationship( + "individual", + "full_name", + "John Doe", + "domain", + "domain", + "example.com", + "HAS_DOMAIN" + ) + + +def test_get_primary_field(): + """Test the _get_primary_field helper method.""" + transform = TestTransform(sketch_id="test", scan_id="test") + + # Test with Domain (has primary field marked) + domain = Domain(domain="example.com") + assert transform._get_primary_field(domain) == "domain" + + # Test with Email (has primary field marked) + email = Email(email="test@example.com") + assert transform._get_primary_field(email) == "email" + + # Test with Individual (no primary field marked, falls back to first required field) + individual = Individual( + first_name="John", + last_name="Doe", + full_name="John Doe" + ) + assert transform._get_primary_field(individual) == "first_name" + + +def test_create_node_with_property_override(): + """Test that property overrides work with Pydantic objects.""" + transform = TestTransform(sketch_id="test", scan_id="test") + + domain = Domain(domain="example.com") + + # Should be able to override properties + transform.create_node(domain, type="subdomain") diff --git a/flowsint-core/tests/transforms/base.py b/flowsint-core/tests/transforms/base.py new file mode 100644 index 0000000..470622b --- /dev/null +++ b/flowsint-core/tests/transforms/base.py @@ -0,0 +1,71 @@ +from typing import List +from flowsint_core.core.transform_base import Transform +from flowsint_types import Phone +from flowsint_types.domain import Domain +from flowsint_types.ip import Ip + + +class ResolveTransform(Transform): + InputType = List[Domain] + OutputType = List[Ip] + + @classmethod + def name(cls) -> str: + return "domain_to_ip" + + @classmethod + def 
category(cls) -> str: + return "Domain" + + @classmethod + def key(cls) -> str: + return "domain" + + async def scan(self, data: InputType) -> OutputType: + return [] + + def postprocess(self, results: OutputType, original_input: InputType) -> OutputType: + return [] + + +# Make types available at module level for easy access +InputType = ResolveTransform.InputType +OutputType = ResolveTransform.OutputType + +transform = ResolveTransform("sketch_123", "scan_123") + + +def test_correct_preprocess(): + inputs = [ + Domain(domain="mydomain.com"), + {"domain": "blog.mydomain2.com"}, + "mydomain3.com", + "notADomaiN", + ] + preprocessed = transform.preprocess(inputs) + assert len(preprocessed) == 3 # 3 valid domains + assert preprocessed[0].domain == "mydomain.com" + assert preprocessed[0].label == "mydomain.com" + assert preprocessed[0].root == True + + assert preprocessed[1].domain == "blog.mydomain2.com" + assert preprocessed[1].label == "blog.mydomain2.com" + assert preprocessed[1].root == False + + assert preprocessed[2].domain == "mydomain3.com" + assert preprocessed[2].label == "mydomain3.com" + assert preprocessed[2].root == True + + +def test_incorrect_preprocess(): + inputs = [ + Phone(number="+33634565423"), + {"name": "JohnDoe"}, + "mydomain.com", + "notADomaiN", + ] + preprocessed = transform.preprocess(inputs) + assert len(preprocessed) == 1 # 1 valid domain + assert preprocessed[0].domain == "mydomain.com" + assert preprocessed[0].root == True + assert preprocessed[0].label == "mydomain.com" diff --git a/flowsint-transforms/src/flowsint_transforms/asn/to_cidrs.py b/flowsint-transforms/src/flowsint_transforms/asn/to_cidrs.py index 251613f..75f9b10 100644 --- a/flowsint-transforms/src/flowsint_transforms/asn/to_cidrs.py +++ b/flowsint-transforms/src/flowsint_transforms/asn/to_cidrs.py @@ -62,26 +62,6 @@ class AsnToCidrsTransform(Transform): def key(cls) -> str: return "number" - def preprocess( - self, data: Union[List[str], List[int], List[dict], 
InputType] - ) -> InputType: - cleaned: InputType = [] - for item in data: - asn_obj = None - try: - if isinstance(item, (str, int)): - asn_obj = ASN(number=parse_asn(str(item))) - elif isinstance(item, dict) and "number" in item: - asn_obj = ASN(number=parse_asn(str(item["number"]))) - elif isinstance(item, ASN): - asn_obj = item - if asn_obj and is_valid_asn(str(asn_obj.number)): - cleaned.append(asn_obj) - except ValueError: - Logger.warn(self.sketch_id, {"message": f"Invalid ASN format: {item}"}) - continue - return cleaned - async def scan(self, data: InputType) -> OutputType: """Find CIDR from ASN using asnmap.""" cidrs: OutputType = [] @@ -145,33 +125,11 @@ class AsnToCidrsTransform(Transform): if str(cidr.network) == "0.0.0.0/0": continue # Skip default CIDR for unknown ASN if self.neo4j_conn: - self.create_node( - "asn", - "number", - asn.number, - label=f"AS{asn.number}", - caption=f"AS{asn.number}", - type="asn", - ) + self.create_node(asn) - self.create_node( - "cidr", - "network", - str(cidr.network), - label=str(cidr.network), - caption=str(cidr.network), - type="cidr", - ) + self.create_node(cidr) - self.create_relationship( - "asn", - "number", - asn.number, - "cidr", - "network", - str(cidr.network), - "ANNOUNCES", - ) + self.create_relationship(asn, cidr, "ANNOUNCES") self.log_graph_message( f"AS{asn.number} announces CIDR {cidr.network}" @@ -200,15 +158,9 @@ class AsnToCidrsTransform(Transform): type="cidr", ) - self.create_relationship( - "asn", - "number", - asn.number, - "cidr", - "network", - str(cidr.network), - "ANNOUNCES", - ) + asn_obj = ASN(number=asn.number) + cidr_obj = CIDR(network=str(cidr.network)) + self.create_relationship(asn_obj, cidr_obj, "ANNOUNCES") self.log_graph_message( f"AS{asn.number} announces CIDR {cidr.network}" diff --git a/flowsint-transforms/src/flowsint_transforms/cidr/to_ips.py b/flowsint-transforms/src/flowsint_transforms/cidr/to_ips.py index 237701f..bf1599e 100644 --- 
a/flowsint-transforms/src/flowsint_transforms/cidr/to_ips.py +++ b/flowsint-transforms/src/flowsint_transforms/cidr/to_ips.py @@ -60,24 +60,6 @@ class CidrToIpsTransform(Transform): def category(cls) -> str: return "Cidr" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - cidr_obj = None - try: - if isinstance(item, str): - cidr_obj = CIDR(network=item) - elif isinstance(item, dict) and "network" in item: - cidr_obj = CIDR(network=item["network"]) - elif isinstance(item, CIDR): - cidr_obj = item - if cidr_obj: - cleaned.append(cidr_obj) - except ValueError: - Logger.warn(self.sketch_id, {"message": f"Invalid CIDR format: {item}"}) - continue - return cleaned - async def scan(self, data: InputType) -> OutputType: """Find IP addresses from CIDR using mapcidr.""" ips: OutputType = [] @@ -141,69 +123,25 @@ class CidrToIpsTransform(Transform): for ip in ip_list: if self.neo4j_conn: # Create CIDR node - self.create_node( - "cidr", - "network", - str(cidr.network), - label=str(cidr.network), - caption=str(cidr.network), - type="cidr", - ) + self.create_node(cidr) # Create IP node - self.create_node( - "ip", - "address", - ip.address, - label=ip.address, - caption=ip.address, - type="ip", - ) + self.create_node(ip) # Create relationship - self.create_relationship( - "cidr", - "network", - str(cidr.network), - "ip", - "address", - ip.address, - "CONTAINS", - ) + self.create_relationship(cidr, ip, "CONTAINS") else: # Fallback: original behavior (one-to-one zip) for cidr, ip in zip(original_input, results): if self.neo4j_conn: # Create CIDR node - self.create_node( - "cidr", - "network", - str(cidr.network), - label=str(cidr.network), - caption=str(cidr.network), - type="cidr", - ) - + self.create_node(cidr) # Create IP node - self.create_node( - "ip", - "address", - ip.address, - label=ip.address, - caption=ip.address, - type="ip", - ) - + self.create_node(ip) # Create relationship - 
self.create_relationship( - "cidr", - "network", - str(cidr.network), - "ip", - "address", - ip.address, - "CONTAINS", - ) + cidr_obj = CIDR(network=str(cidr.network)) + ip_obj = Ip(address=ip.address) + self.create_relationship(cidr_obj, ip_obj, "CONTAINS") self.log_graph_message( f"CIDR {cidr.network} contains IP {ip.address}" diff --git a/flowsint-transforms/src/flowsint_transforms/crypto/to_nfts.py b/flowsint-transforms/src/flowsint_transforms/crypto/to_nfts.py index e969735..4e7459c 100644 --- a/flowsint-transforms/src/flowsint_transforms/crypto/to_nfts.py +++ b/flowsint-transforms/src/flowsint_transforms/crypto/to_nfts.py @@ -73,24 +73,12 @@ class CryptoWalletAddressToNFTs(Transform): def key(cls) -> str: return "address" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - wallet_obj = None - if isinstance(item, str): - wallet_obj = CryptoWallet(address=item) - elif isinstance(item, dict) and "address" in item: - wallet_obj = CryptoWallet(address=item["address"]) - elif isinstance(item, CryptoWallet): - wallet_obj = item - if wallet_obj: - cleaned.append(wallet_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] api_key = self.get_secret("ETHERSCAN_API_KEY", os.getenv("ETHERSCAN_API_KEY")) - api_url = self.get_params().get("ETHERSCAN_API_URL", "https://api.etherscan.io/v2/api") + api_url = self.get_params().get( + "ETHERSCAN_API_URL", "https://api.etherscan.io/v2/api" + ) for d in data: try: nfts = self._get_nfts(d.address, api_key, api_url) @@ -137,40 +125,22 @@ class CryptoWalletAddressToNFTs(Transform): for nfts in results: for nft in nfts: # Create or update wallet node - self.create_node( - "cryptowallet", - "wallet", - nft.wallet.address, - caption=nft.wallet.address, - type="cryptowallet", - ) - + self.create_node(nft.wallet) # Create or update NFT node nft_key = f"{nft.contract_address}_{nft.token_id}" - self.create_node( - 
"nft", - "nft_id", - nft_key, + self.create_node(nft) + # Create relationship from wallet to NFT + wallet_obj = CryptoWallet(address=nft.wallet.address) + nft_obj = CryptoNFT( + wallet=wallet_obj, contract_address=nft.contract_address, token_id=nft.token_id, collection_name=nft.collection_name, metadata_url=nft.metadata_url, image_url=nft.image_url, name=nft.name, - caption=nft.name, - type="nft", - ) - - # Create relationship from wallet to NFT - self.create_relationship( - "cryptowallet", - "wallet", - nft.wallet.address, - "nft", - "nft_id", - nft_key, - "OWNS", ) + self.create_relationship(wallet_obj, nft_obj, "OWNS") self.log_graph_message( f"Found NFT for {nft.wallet.address}: {nft.contract_address} - {nft.token_id}" ) diff --git a/flowsint-transforms/src/flowsint_transforms/crypto/to_transactions.py b/flowsint-transforms/src/flowsint_transforms/crypto/to_transactions.py index b520b05..5068981 100644 --- a/flowsint-transforms/src/flowsint_transforms/crypto/to_transactions.py +++ b/flowsint-transforms/src/flowsint_transforms/crypto/to_transactions.py @@ -79,20 +79,6 @@ class CryptoWalletAddressToTransactions(Transform): def key(cls) -> str: return "address" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - wallet_obj = None - if isinstance(item, str): - wallet_obj = CryptoWallet(address=item) - elif isinstance(item, dict) and "address" in item: - wallet_obj = CryptoWallet(address=item["address"]) - elif isinstance(item, CryptoWallet): - wallet_obj = item - if wallet_obj: - cleaned.append(wallet_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] api_key = self.get_secret("ETHERSCAN_API_KEY", os.getenv("ETHERSCAN_API_KEY")) @@ -194,21 +180,8 @@ class CryptoWalletAddressToTransactions(Transform): for transactions in results: for tx in transactions: # Create or update both wallet nodes - self.create_node( - "cryptowallet", - 
"wallet", - tx.source.address, - caption=tx.source.address, - type="cryptowallet", - ) - self.create_node( - "cryptowallet", - "wallet", - tx.target.address, - caption=tx.target.address, - type="cryptowallet", - ) - + self.create_node(tx.source) + self.create_node(tx.target.address) # Create transaction as an edge between wallets (keeping complex query for transaction properties) tx_query = """ MATCH (source:cryptowallet {wallet: $source}) diff --git a/flowsint-transforms/src/flowsint_transforms/domain/to_asn.py b/flowsint-transforms/src/flowsint_transforms/domain/to_asn.py index 83c6917..3463077 100644 --- a/flowsint-transforms/src/flowsint_transforms/domain/to_asn.py +++ b/flowsint-transforms/src/flowsint_transforms/domain/to_asn.py @@ -63,22 +63,6 @@ class DomainToAsnTransform(Transform): def key(cls) -> str: return "domain" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - domain_obj = None - if isinstance(item, str): - if is_valid_domain(item): - domain_obj = Domain(domain=item) - elif isinstance(item, dict) and "domain" in item: - if is_valid_domain(item["domain"]): - domain_obj = Domain(domain=item["domain"]) - elif isinstance(item, Domain): - domain_obj = item - if domain_obj: - cleaned.append(domain_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] asnmap = AsnmapTool() @@ -135,35 +119,12 @@ class DomainToAsnTransform(Transform): if input_data and self.neo4j_conn: for domain, asn in zip(input_data, results): # Create domain node - self.create_node( - "domain", - "domain", - domain.domain, - label=domain.domain, - caption=domain.domain, - type="domain", - ) - + self.create_node(domain) # Create ASN node - self.create_node( - "asn", - "number", - asn.number, - label=f"AS{asn.number}", - caption=f"AS{asn.number}", - type="asn", - ) + self.create_node(asn) # Create relationship - self.create_relationship( - "domain", - "domain", 
- domain.domain, - "asn", - "number", - asn.number, - "HOSTED_IN", - ) + self.create_relationship(domain, asn, "HOSTED_IN") self.log_graph_message( f"Domain {domain.domain} is hosted in AS{asn.number} ({asn.name})" diff --git a/flowsint-transforms/src/flowsint_transforms/domain/to_history.py b/flowsint-transforms/src/flowsint_transforms/domain/to_history.py index ed3d895..f6d0700 100644 --- a/flowsint-transforms/src/flowsint_transforms/domain/to_history.py +++ b/flowsint-transforms/src/flowsint_transforms/domain/to_history.py @@ -6,6 +6,8 @@ from flowsint_core.core.transform_base import Transform from flowsint_types.domain import Domain from flowsint_types.individual import Individual from flowsint_types.organization import Organization +from flowsint_types.email import Email +from flowsint_types.phone import Phone from flowsint_core.utils import is_valid_domain, is_root_domain from flowsint_types.address import Location from flowsint_core.core.logger import Logger @@ -67,25 +69,6 @@ class DomainToHistoryTransform(Transform): def key(cls) -> str: return "domain" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - domain_obj = None - if isinstance(item, str): - domain_obj = Domain(domain=item, root=is_root_domain(item)) - elif isinstance(item, dict) and "domain" in item: - domain_obj = Domain( - domain=item["domain"], root=is_root_domain(item["domain"]) - ) - elif isinstance(item, Domain): - # If the Domain object already exists, update its root field - domain_obj = Domain( - domain=item.domain, root=is_root_domain(item.domain) - ) - if domain_obj and is_valid_domain(domain_obj.domain): - cleaned.append(domain_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: """Find infos related to domains using whoxy api.""" domains: OutputType = [] @@ -452,25 +435,13 @@ class DomainToHistoryTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating domain node: 
{domain_name}"}, ) - self.create_node( - "domain", - "domain", - domain_name, - label=domain_name, - caption=domain_name, - type="domain", - ) + domain_obj = Domain(domain=domain_name) + self.create_node(domain_obj) # Create relationship between original domain and found domain - self.create_relationship( - "domain", - "domain", - original_domain_name, - "domain", - "domain", - domain_name, - "HAS_RELATED_DOMAIN", - ) + original_domain_obj = Domain(domain=original_domain_name) + domain_obj_rel = Domain(domain=domain_name) + self.create_relationship(original_domain_obj, domain_obj_rel, "HAS_RELATED_DOMAIN") # Create individual node if not already processed individual_id = ( @@ -484,24 +455,11 @@ class DomainToHistoryTransform(Transform): "message": f"[WHOXY] Creating individual node: {individual.full_name}" }, ) - self.create_node( - "individual", - "full_name", - individual.full_name, - caption=individual.full_name, - type="individual", - ) + self.create_node(individual) # Create relationship between individual and domain - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "domain", - "domain", - domain_name, - f"IS_{contact_type.upper()}_CONTACT", - ) + domain_obj_contact = Domain(domain=domain_name) + self.create_relationship(individual, domain_obj_contact, f"IS_{contact_type.upper()}_CONTACT") # Process email addresses if individual.email_addresses: @@ -513,22 +471,9 @@ class DomainToHistoryTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating email node: {email_str}"}, ) - self.create_node( - "email", - "email", - email_str, - caption=email_str, - type="email", - ) - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "email", - "email", - email_str, - "HAS_EMAIL", - ) + email_node = Email(email=email_str) + self.create_node(email_node) + self.create_relationship(individual, email_node, "HAS_EMAIL") # Process phone numbers if individual.phone_numbers: @@ -540,22 +485,9 @@ class 
DomainToHistoryTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating phone node: {phone_str}"}, ) - self.create_node( - "phone", - "number", - phone_str, - caption=phone_str, - type="phone", - ) - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "phone", - "number", - phone_str, - "HAS_PHONE", - ) + phone_node = Phone(number=phone_str) + self.create_node(phone_node) + self.create_relationship(individual, phone_node, "HAS_PHONE") # Process physical address from contact data contact_data = individual_info["contact_data"] @@ -572,22 +504,8 @@ class DomainToHistoryTransform(Transform): "message": f"[WHOXY] Creating address node: {address.address}" }, ) - self.create_node( - "location", - "address", - address.address, - caption=f"{address.address}, {address.city}", - type="location", - ) - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "location", - "address", - address.address, - "LIVES_AT", - ) + self.create_node(address) + self.create_relationship(individual, address, "LIVES_AT") self.log_graph_message( f"Processed individual {individual.full_name} ({contact_type}) for domain {domain_name}" @@ -614,25 +532,13 @@ class DomainToHistoryTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating domain node: {domain_name}"}, ) - self.create_node( - "domain", - "domain", - domain_name, - label=domain_name, - caption=domain_name, - type="domain", - ) + domain_obj = Domain(domain=domain_name) + self.create_node(domain_obj) # Create relationship between original domain and found domain - self.create_relationship( - "domain", - "domain", - original_domain_name, - "domain", - "domain", - domain_name, - "HAS_RELATED_DOMAIN", - ) + original_domain_obj3 = Domain(domain=original_domain_name) + domain_obj_rel3 = Domain(domain=domain_name) + self.create_relationship(original_domain_obj3, domain_obj_rel3, "HAS_RELATED_DOMAIN") # Create organization node if not already processed if 
organization.name not in processed_organizations: @@ -643,24 +549,11 @@ class DomainToHistoryTransform(Transform): "message": f"[WHOXY] Creating organization node: {organization.name}" }, ) - self.create_node( - "organization", - "name", - organization.name, - caption=organization.name, - type="organization", - ) + self.create_node(organization) # Create relationship between organization and domain - self.create_relationship( - "organization", - "name", - organization.name, - "domain", - "domain", - domain_name, - f"IS_{contact_type.upper()}_CONTACT", - ) + domain_obj_org = Domain(domain=domain_name) + self.create_relationship(organization, domain_obj_org, f"IS_{contact_type.upper()}_CONTACT") self.log_graph_message( f"Processed organization {organization.name} ({contact_type}) for domain {domain_name}" diff --git a/flowsint-transforms/src/flowsint_transforms/domain/to_ip.py b/flowsint-transforms/src/flowsint_transforms/domain/to_ip.py index 8ac3eae..55cc885 100644 --- a/flowsint-transforms/src/flowsint_transforms/domain/to_ip.py +++ b/flowsint-transforms/src/flowsint_transforms/domain/to_ip.py @@ -1,16 +1,14 @@ import socket -from typing import List, Union +from typing import List from flowsint_core.core.logger import Logger from flowsint_core.core.transform_base import Transform from flowsint_types.domain import Domain from flowsint_types.ip import Ip -from flowsint_core.utils import is_valid_domain, is_root_domain class ResolveTransform(Transform): """Resolve domain names to IP addresses.""" - # Define the input and output types as class attributes InputType = List[Domain] OutputType = List[Ip] @@ -29,253 +27,7 @@ class ResolveTransform(Transform): @classmethod def documentation(cls) -> str: """Return formatted markdown documentation for the domain resolver transform.""" - return """ - # Domain Resolver Transform - - Resolve domain names to their corresponding IP addresses using DNS queries. 
This transform performs forward DNS resolution to discover the IP addresses associated with domain names and subdomains. - - ## Overview - - The Domain Resolver Transform takes domain names as input and returns their resolved IP addresses. It automatically handles different input formats and validates domains before attempting resolution. - - ## Input/Output Types - - - **Input**: `List[Domain]` - Array of domain objects to resolve - - **Output**: `List[Ip]` - Array of resolved IP addresses - - ## Input Format Support - - The transform accepts multiple input formats and automatically converts them: - - ### String Format - - ```python - ["example.com", "subdomain.example.com"] - ``` - - ### Dictionary Format - - ```python - [ - {"domain": "example.com"}, - {"domain": "subdomain.example.com"} - ] - ``` - - ### Domain Object Format - - ```python - [ - Domain(domain="example.com", root=True), - Domain(domain="subdomain.example.com", root=False) - ] - ``` - - ## Resolution Process - - ### 1. Input Validation - - - Validates domain format using built-in validation - - Determines if domain is root domain or subdomain - - Filters out invalid domains - - ### 2. DNS Resolution - - - Uses Python's `socket.gethostbyname()` for DNS queries - - Resolves each domain to its primary A record - - Handles resolution errors gracefully - - ### 3. 
Result Storage - - - Stores domain-to-IP relationships in Neo4j graph database - - Creates nodes for both domains and IP addresses - - Establishes `RESOLVES_TO` relationships - - ## Example Usage - - ### Basic Domain Resolution - - **Input:** - ```json - [ - "google.com", - "github.com", - "stackoverflow.com" - ] - ``` - - **Expected Output:** - ```json - [ - {"address": "142.250.191.14"}, - {"address": "140.82.113.4"}, - {"address": "151.101.193.69"} - ] - ``` - - ### Mixed Input Types - - **Input:** - ```json - [ - "example.com", - {"domain": "subdomain.example.com"}, - {"domain": "api.example.com", "root": false} - ] - ``` - - ## Graph Database Storage - - ### Node Creation - - **Domain Node:** - ```cypher - MERGE (d:domain {domain: "example.com"}) - SET d.sketch_id = "sketch-uuid", - d.label = "example.com", - d.type = "domain" // or "subdomain" - ``` - - **IP Node:** - ```cypher - MERGE (ip:ip {address: "93.184.216.34"}) - SET ip.sketch_id = "sketch-uuid", - ip.label = "93.184.216.34", - ip.type = "ip" - ``` - - ### Relationship Creation - - ```cypher - MERGE (d)-[:RESOLVES_TO {sketch_id: "sketch-uuid"}]->(ip) - ``` - - ## Domain Type Classification - - The transform automatically classifies domains: - - - **Root Domain**: `example.com` → `type: "domain"` - - **Subdomain**: `api.example.com` → `type: "subdomain"` - - ## Error Handling - - ### Resolution Failures - - When DNS resolution fails, the transform: - - - Logs the error with domain name - - Continues processing remaining domains - - Does not create nodes for failed resolutions - - Common resolution failures: - - - **NXDOMAIN**: Domain does not exist - - **Timeout**: DNS server not responding - - **Network errors**: Connectivity issues - - ### Invalid Input Handling - - The transform filters out: - - - Malformed domain names - - Empty strings - - Non-string, non-dict, non-Domain inputs - - ## Performance Considerations - - ### Resolution Speed - - - Sequential DNS queries (not parallelized) - - Typical 
resolution time: 10-100ms per domain - - Consider batch size for large domain lists - - ### DNS Caching - - - Relies on system DNS cache - - Results may vary based on TTL values - - Fresh queries may take longer than cached ones - - ## Use Cases - - ### Investigation Scenarios - - 1. **Domain Enumeration**: Resolve discovered subdomains to find active hosts - 2. **Infrastructure Mapping**: Map domain-to-IP relationships for target organization - 3. **CDN Detection**: Identify content delivery network usage patterns - 4. **IP Pivoting**: Find shared hosting infrastructure across domains - - ### Workflow Integration - - ``` - [Domain Discovery] → [Domain Resolver] → [IP Geolocation] - → [Port Transform] - → [ASN Lookup] - ``` - - ## Security Considerations - - - **DNS Leakage**: Resolution queries may be logged by DNS providers - - **Rate Limiting**: Some DNS servers may rate limit queries - - **Privacy**: Consider using secure DNS (DoH/DoT) for sensitive investigations - - ## Limitations - - - **IPv4 Only**: Currently resolves only A records (IPv4) - - **Single IP**: Returns only the first resolved IP address - - **No CNAME Following**: Does not follow CNAME chains - - **No Cache Control**: Cannot force fresh DNS queries - - ## Troubleshooting - - ### Common Issues - - 1. **No Results**: Check domain validity and DNS configuration - 2. **Timeouts**: Verify network connectivity and DNS server availability - 3. **Partial Results**: Some domains may fail while others succeed - - ### Debug Information - - The transform provides logging for: - - Input validation results - - DNS resolution attempts - - Graph database operations - - Error conditions - - Check Flowsint logs for detailed resolution information. 
- - ## Technical Details - - ### DNS Query Method - - - Uses Python's standard library `socket.gethostbyname()` - - Follows system DNS configuration - - Respects `/etc/hosts` file entries on Unix systems - - ### Graph Integration - - - Creates typed nodes in Neo4j - - Maintains investigation context via `sketch_id` - - Supports graph traversal for relationship analysis - """ - - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - domain_obj = None - if isinstance(item, str): - domain_obj = Domain(domain=item, root=is_root_domain(item)) - elif isinstance(item, dict) and "domain" in item: - domain_obj = Domain( - domain=item["domain"], root=is_root_domain(item["domain"]) - ) - elif isinstance(item, Domain): - # If the Domain object already exists, update its root field - domain_obj = Domain( - domain=item.domain, root=is_root_domain(item.domain) - ) - if domain_obj and is_valid_domain(domain_obj.domain): - cleaned.append(domain_obj) - return cleaned + return "" async def scan(self, data: InputType) -> OutputType: results: OutputType = [] @@ -293,20 +45,11 @@ class ResolveTransform(Transform): def postprocess(self, results: OutputType, original_input: InputType) -> OutputType: for domain_obj, ip_obj in zip(original_input, results): - self.create_node( - "domain", - "domain", - domain_obj.domain, - type="domain", - ) - self.create_node("ip", "address", ip_obj.address, **ip_obj.__dict__) + self.create_node(domain_obj) + self.create_node(ip_obj) self.create_relationship( - "domain", - "domain", - domain_obj.domain, - "ip", - "address", - ip_obj.address, + domain_obj, + ip_obj, "RESOLVES_TO", ) self.log_graph_message( @@ -315,6 +58,5 @@ class ResolveTransform(Transform): return results -# Make types available at module level for easy access InputType = ResolveTransform.InputType OutputType = ResolveTransform.OutputType diff --git 
a/flowsint-transforms/src/flowsint_transforms/domain/to_root_domain.py b/flowsint-transforms/src/flowsint_transforms/domain/to_root_domain.py index c5e99ee..73fee12 100644 --- a/flowsint-transforms/src/flowsint_transforms/domain/to_root_domain.py +++ b/flowsint-transforms/src/flowsint_transforms/domain/to_root_domain.py @@ -28,22 +28,6 @@ class DomainToRootDomain(Transform): def key(cls) -> str: return "domain" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - domain_obj = None - if isinstance(item, str): - if is_valid_domain(item): - domain_obj = Domain(domain=item) - elif isinstance(item, dict) and "domain" in item: - if is_valid_domain(item["domain"]): - domain_obj = Domain(domain=item["domain"]) - elif isinstance(item, Domain): - domain_obj = item - if domain_obj: - cleaned.append(domain_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] self.domain_root_mapping = [] # Reset mapping @@ -73,22 +57,13 @@ class DomainToRootDomain(Transform): if not self.neo4j_conn: continue - # Create root domain node - self.create_node("domain", "domain", root_domain.domain, **root_domain.__dict__) - - # Create original domain node - self.create_node("domain", "domain", original_domain.domain, **original_domain.__dict__) + # New simplified pattern: pass Pydantic objects directly + # Override type when needed + self.create_node(root_domain) + self.create_node(original_domain) # Create relationship from root domain to original domain - self.create_relationship( - "domain", - "domain", - root_domain.domain, - "domain", - "domain", - original_domain.domain, - "HAS_SUBDOMAIN", - ) + self.create_relationship(root_domain, original_domain, "HAS_SUBDOMAIN") self.log_graph_message( f"{root_domain.domain} -> HAS_SUBDOMAIN -> {original_domain.domain}" diff --git a/flowsint-transforms/src/flowsint_transforms/domain/to_subdomains.py 
b/flowsint-transforms/src/flowsint_transforms/domain/to_subdomains.py index 4efb927..6f0b601 100644 --- a/flowsint-transforms/src/flowsint_transforms/domain/to_subdomains.py +++ b/flowsint-transforms/src/flowsint_transforms/domain/to_subdomains.py @@ -12,7 +12,7 @@ class SubdomainTransform(Transform): """Transform to find subdomains associated with a domain.""" # Define types as class attributes - base class handles schema generation automatically - InputType = List[Domain | str] + InputType = List[Domain] OutputType = List[Domain] @classmethod @@ -27,20 +27,6 @@ class SubdomainTransform(Transform): def key(cls) -> str: return "domain" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - domain_obj = None - if isinstance(item, str): - domain_obj = Domain(domain=item) - elif isinstance(item, dict) and "domain" in item: - domain_obj = Domain(domain=item["domain"]) - elif isinstance(item, Domain): - domain_obj = item - if domain_obj and is_valid_domain(domain_obj.domain): - cleaned.append(domain_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: """Find subdomains using subfinder (Docker) or fallback to crt.sh.""" domains: OutputType = [] @@ -115,18 +101,12 @@ class SubdomainTransform(Transform): ) # Create subdomain node - self.create_node("domain", "domain", subdomain, domain=subdomain) + parent_domain_obj = Domain(domain=domain_obj["domain"]) + subdomain_obj = Domain(domain=subdomain) + self.create_node(subdomain_obj) # Create relationship from parent domain to subdomain - self.create_relationship( - "domain", - "domain", - domain_obj["domain"], - "domain", - "domain", - subdomain, - "HAS_SUBDOMAIN", - ) + self.create_relationship(parent_domain_obj, subdomain_obj, "HAS_SUBDOMAIN") self.log_graph_message( f"{domain_obj['domain']} -> {len(domain_obj['subdomains'])} subdomain(s) found." 
diff --git a/flowsint-transforms/src/flowsint_transforms/domain/to_website.py b/flowsint-transforms/src/flowsint_transforms/domain/to_website.py index 271292b..a90432b 100644 --- a/flowsint-transforms/src/flowsint_transforms/domain/to_website.py +++ b/flowsint-transforms/src/flowsint_transforms/domain/to_website.py @@ -26,22 +26,6 @@ class DomainToWebsiteTransform(Transform): def key(cls) -> str: return "domain" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - domain_obj = None - if isinstance(item, str): - if is_valid_domain(item): - domain_obj = Domain(domain=item) - elif isinstance(item, dict) and "domain" in item: - if is_valid_domain(item["domain"]): - domain_obj = Domain(domain=item["domain"]) - elif isinstance(item, Domain): - domain_obj = item - if domain_obj: - cleaned.append(domain_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] for domain in data: @@ -100,34 +84,13 @@ class DomainToWebsiteTransform(Transform): if self.neo4j_conn: # Create domain node - self.create_node( - "domain", "domain", website.domain.domain, type="domain" - ) + self.create_node(website.domain) # Create website node - self.create_node( - "website", - "url", - str(website.url), - active=website.active, - redirects=( - [str(redirect) for redirect in website.redirects] - if website.redirects - else [] - ), - type="website", - ) + self.create_node(website) # Create relationship - self.create_relationship( - "domain", - "domain", - website.domain.domain, - "website", - "url", - str(website.url), - "HAS_WEBSITE", - ) + self.create_relationship(website.domain, website, "HAS_WEBSITE") is_active_str = "active" if website.active else "inactive" redirects_str = ( diff --git a/flowsint-transforms/src/flowsint_transforms/domain/to_whois.py b/flowsint-transforms/src/flowsint_transforms/domain/to_whois.py index 681a087..e00128c 100644 --- 
a/flowsint-transforms/src/flowsint_transforms/domain/to_whois.py +++ b/flowsint-transforms/src/flowsint_transforms/domain/to_whois.py @@ -4,6 +4,7 @@ from flowsint_core.core.transform_base import Transform from flowsint_types.domain import Domain from flowsint_types.whois import Whois from flowsint_types.email import Email +from flowsint_types.organization import Organization from flowsint_core.core.logger import Logger @@ -77,13 +78,18 @@ class WhoisTransform(Transform): elif hasattr(whois_info, "domain_id") and whois_info.domain_id: registry_domain_id = str(whois_info.domain_id) + # Create organization object if org info is available + organization = None + if whois_info.org: + organization = Organization(name=str(whois_info.org)) + whois_obj = Whois( domain=domain, registry_domain_id=registry_domain_id, registrar=( str(whois_info.registrar) if whois_info.registrar else None ), - org=str(whois_info.org) if whois_info.org else None, + organization=organization, city=str(whois_info.city) if whois_info.city else None, country=str(whois_info.country) if whois_info.country else None, email=emails[0] if emails else None, @@ -107,98 +113,62 @@ class WhoisTransform(Transform): continue # Create domain node - self.create_node( - "domain", - "domain", - whois_obj.domain.domain, - root=whois_obj.domain.root, - type="domain", - ) + self.create_node(whois_obj.domain) - # Create whois node + # Create whois node with custom key (only primitive fields) whois_key = f"{whois_obj.domain.domain}_{self.sketch_id}" + whois_props = { + "registry_domain_id": whois_obj.registry_domain_id, + "registrar": whois_obj.registrar, + "city": whois_obj.city, + "country": whois_obj.country, + "creation_date": whois_obj.creation_date, + "expiration_date": whois_obj.expiration_date, + "label": whois_obj.label, + } + self.create_node("whois", "whois_id", whois_key, **whois_props) - if whois_obj.registry_domain_id: - whois_label = whois_obj.registry_domain_id - else: - whois_label = 
whois_obj.domain.domain - self.create_node( - "whois", - "whois_id", - whois_key, - domain=whois_obj.domain.domain, + # Create relationship between domain and whois + domain_obj = Domain(domain=whois_obj.domain.domain) + whois_node = Whois( + domain=domain_obj, registry_domain_id=whois_obj.registry_domain_id, registrar=whois_obj.registrar, - org=whois_obj.org, city=whois_obj.city, country=whois_obj.country, creation_date=whois_obj.creation_date, expiration_date=whois_obj.expiration_date, - email=whois_obj.email.email if whois_obj.email else None, - label=whois_label, - type="whois", ) + self.create_relationship(domain_obj, whois_node, "HAS_WHOIS") - # Create relationship between domain and whois - self.create_relationship( - "domain", - "domain", - whois_obj.domain.domain, - "whois", - "whois_id", - whois_key, - "HAS_WHOIS", - ) - - # Create organization node if org information is available - if whois_obj.org: - self.create_node( - "organization", - "name", - whois_obj.org, - country=whois_obj.country, - founding_date=whois_obj.creation_date, - description=f"Organization from WHOIS data for {whois_obj.domain.domain}", - caption=whois_obj.org, - type="organization", - ) + # Create organization node if organization information is available + if whois_obj.organization: + self.create_node(whois_obj.organization) # Create relationship between organization and domain - self.create_relationship( - "organization", - "name", - whois_obj.org, - "domain", - "domain", - whois_obj.domain.domain, - "HAS_DOMAIN", - ) + domain_obj2 = Domain(domain=whois_obj.domain.domain) + self.create_relationship(whois_obj.organization, domain_obj2, "HAS_DOMAIN") self.log_graph_message( - f"{whois_obj.domain.domain} -> {whois_obj.org} (organization)" + f"{whois_obj.domain.domain} -> {whois_obj.organization.name} (organization)" ) + # Create email node if email information is available if whois_obj.email: - self.create_node( - "email", "email", whois_obj.email.email, **whois_obj.email.__dict__ - 
) - self.create_relationship( - "whois", - "whois_id", - whois_key, - "email", - "email", - whois_obj.email.email, - "REGISTERED_BY", + self.create_node(whois_obj.email) + whois_node2 = Whois( + domain=Domain(domain=whois_obj.domain.domain), + registry_domain_id=whois_obj.registry_domain_id, + registrar=whois_obj.registrar, + city=whois_obj.city, + country=whois_obj.country, + creation_date=whois_obj.creation_date, + expiration_date=whois_obj.expiration_date, ) + self.create_relationship(whois_node2, whois_obj.email, "REGISTERED_BY") self.log_graph_message( - f"WHOIS for {whois_obj.domain.domain} -> registry_id: {whois_obj.registry_domain_id} registrar: {whois_obj.registrar} org: {whois_obj.org} city: {whois_obj.city} country: {whois_obj.country} creation_date: {whois_obj.creation_date} expiration_date: {whois_obj.expiration_date}" + f"WHOIS for {whois_obj.domain.domain} -> registry_id: {whois_obj.registry_domain_id} registrar: {whois_obj.registrar} org: {whois_obj.organization.name if whois_obj.organization else None} city: {whois_obj.city} country: {whois_obj.country} creation_date: {whois_obj.creation_date} expiration_date: {whois_obj.expiration_date}" ) return results - - -# Make types available at module level for easy access -InputType = WhoisTransform.InputType -OutputType = WhoisTransform.OutputType diff --git a/flowsint-transforms/src/flowsint_transforms/email/to_domains.py b/flowsint-transforms/src/flowsint_transforms/email/to_domains.py index 33c7a56..5a66a2c 100644 --- a/flowsint-transforms/src/flowsint_transforms/email/to_domains.py +++ b/flowsint-transforms/src/flowsint_transforms/email/to_domains.py @@ -5,6 +5,7 @@ from flowsint_core.core.transform_base import Transform from flowsint_types.domain import Domain from flowsint_types.individual import Individual from flowsint_types.email import Email +from flowsint_types.phone import Phone from flowsint_types.address import Location from flowsint_core.core.logger import Logger from 
flowsint_core.core.graph_db import Neo4jConnection @@ -238,34 +239,15 @@ class EmailToDomainsTransform(Transform): processed_domains.add(domain_name) # Create email node - self.create_node( - "email", - "email", - email.email, - caption=email.email, - type="email", - ) + self.create_node(email) # Create domain node - self.create_node( - "domain", - "domain", - domain_name, - label=domain_name, - caption=domain_name, - type="domain", - ) + domain_obj = Domain(domain=domain_name) + self.create_node(domain_obj) # Create relationship between email and domain - self.create_relationship( - "email", - "email", - email.email, - "domain", - "domain", - domain_name, - "HAS_REGISTERED_DOMAIN", - ) + domain_obj_email = Domain(domain=domain_name) + self.create_relationship(email, domain_obj_email, "HAS_REGISTERED_DOMAIN") # Process all contact types for contact_type, contact in contacts.items(): @@ -314,35 +296,15 @@ class EmailToDomainsTransform(Transform): processed_individuals.add(individual_id) # Create individual node - self.create_node( - "individual", - "full_name", - individual.full_name, - caption=individual.full_name, - type="individual", - ) + self.create_node(individual) # Create relationship between individual and domain - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "domain", - "domain", - domain_name, - f"IS_{contact_type}_CONTACT", - ) + domain_obj_ind = Domain(domain=domain_name) + self.create_relationship(individual, domain_obj_ind, f"IS_{contact_type}_CONTACT") # Create relationship between individual and email - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "email", - "email", - email_address, - f"WORKS_FOR", - ) + email_obj_ind = Email(email=email_address) + self.create_relationship(individual, email_obj_ind, "WORKS_FOR") # Process email addresses if individual.email_addresses: @@ -352,24 +314,11 @@ class EmailToDomainsTransform(Transform): processed_emails.add(email_str) # Create 
email node - self.create_node( - "email", - "email", - email_str, - caption=email_str, - type="email", - ) + email_node = Email(email=email_str) + self.create_node(email_node) # Create relationship between individual and email - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "email", - "email", - email_str, - "HAS_EMAIL", - ) + self.create_relationship(individual, email_node, "HAS_EMAIL") # Process phone numbers if individual.phone_numbers: @@ -379,24 +328,11 @@ class EmailToDomainsTransform(Transform): processed_phones.add(phone_str) # Create phone node - self.create_node( - "phone", - "number", - phone_str, - caption=phone_str, - type="phone", - ) + phone_node = Phone(number=phone_str) + self.create_node(phone_node) # Create relationship between individual and phone - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "phone", - "number", - phone_str, - "HAS_PHONE", - ) + self.create_relationship(individual, phone_node, "HAS_PHONE") # Process physical address address = self.__extract_physical_address(contact) @@ -408,24 +344,10 @@ class EmailToDomainsTransform(Transform): processed_addresses.add(address_id) # Create address node - self.create_node( - "location", - "address", - address.address, - caption=f"{address.address}, {address.city}", - type="location", - ) + self.create_node(address) # Create relationship between individual and address - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "location", - "address", - address.address, - "LIVES_AT", - ) + self.create_relationship(individual, address, "LIVES_AT") InputType = EmailToDomainsTransform.InputType diff --git a/flowsint-transforms/src/flowsint_transforms/email/to_gravatar.py b/flowsint-transforms/src/flowsint_transforms/email/to_gravatar.py index b6864fd..2b9299a 100644 --- a/flowsint-transforms/src/flowsint_transforms/email/to_gravatar.py +++ 
b/flowsint-transforms/src/flowsint_transforms/email/to_gravatar.py @@ -25,20 +25,6 @@ class EmailToGravatarTransform(Transform): def key(cls) -> str: return "email" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - email_obj = None - if isinstance(item, str): - email_obj = Email(email=item) - elif isinstance(item, dict) and "email" in item: - email_obj = Email(email=item["email"]) - elif isinstance(item, Email): - email_obj = item - if email_obj: - cleaned.append(email_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] @@ -100,26 +86,13 @@ class EmailToGravatarTransform(Transform): for email_obj, gravatar_obj in zip(original_input, results): if not self.neo4j_conn: continue - # Create email node - self.create_node("email", "email", email_obj.email, **email_obj.__dict__) - + self.create_node(email_obj) # Create gravatar node gravatar_key = f"{email_obj.email}_{self.sketch_id}" - self.create_node( - "gravatar", "gravatar_id", gravatar_key, **gravatar_obj.__dict__ - ) - + self.create_node(gravatar_obj) # Create relationship between email and gravatar - self.create_relationship( - "email", - "email", - email_obj.email, - "gravatar", - "gravatar_id", - gravatar_key, - "HAS_GRAVATAR", - ) + self.create_relationship(email_obj, gravatar_obj, "HAS_GRAVATAR") self.log_graph_message( f"Gravatar found for email {email_obj.email} -> hash: {gravatar_obj.hash}" diff --git a/flowsint-transforms/src/flowsint_transforms/email/to_leaks.py b/flowsint-transforms/src/flowsint_transforms/email/to_leaks.py index 610ccff..acd3b7f 100644 --- a/flowsint-transforms/src/flowsint_transforms/email/to_leaks.py +++ b/flowsint-transforms/src/flowsint_transforms/email/to_leaks.py @@ -73,20 +73,6 @@ class EmailToBreachesTransform(Transform): }, ] - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in 
data: - email_obj = None - if isinstance(item, str): - email_obj = Email(email=item) - elif isinstance(item, dict) and "email" in item: - email_obj = Email(email=item["email"]) - elif isinstance(item, Email): - email_obj = item - if email_obj: - cleaned.append(email_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] api_key = self.get_secret("HIBP_API_KEY", os.getenv("HIBP_API_KEY")) @@ -150,7 +136,7 @@ class EmailToBreachesTransform(Transform): for email_obj in original_input: if not self.neo4j_conn: continue - self.create_node("email", "email", email_obj.email, **email_obj.__dict__) + self.create_node(email_obj) # Process all breaches for email_address, breach_obj in results: @@ -159,25 +145,11 @@ class EmailToBreachesTransform(Transform): # Create breach node breach_key = f"{breach_obj.name}_{self.sketch_id}" - self.create_node( - "breach", - "breach_id", - breach_key, - **breach_obj.dict(), - label=breach_obj.name, - type="breach", - ) + self.create_node(breach_obj) # Create relationship between the specific email and this breach - self.create_relationship( - "email", - "email", - email_address, - "breach", - "breach_id", - breach_key, - "FOUND_IN_BREACH", - ) + email_obj = Email(email=email_address) + self.create_relationship(email_obj, breach_obj, "FOUND_IN_BREACH") self.log_graph_message( f"Breach found for email {email_address} -> {breach_obj.name} ({breach_obj.title})" ) diff --git a/flowsint-transforms/src/flowsint_transforms/individual/to_domains.py b/flowsint-transforms/src/flowsint_transforms/individual/to_domains.py index c04b310..235a140 100644 --- a/flowsint-transforms/src/flowsint_transforms/individual/to_domains.py +++ b/flowsint-transforms/src/flowsint_transforms/individual/to_domains.py @@ -139,7 +139,9 @@ class IndividualToDomainsTransform(Transform): ) return domains - def __get_infos_from_whoxy(self, individual_name: str, api_key: str) -> Dict[str, Any]: + def __get_infos_from_whoxy( + self, 
individual_name: str, api_key: str + ) -> Dict[str, Any]: infos: Dict[str, Any] = {} whoxy = WhoxyTool() try: @@ -243,9 +245,7 @@ class IndividualToDomainsTransform(Transform): if not all([address, city, zip_code, country]): return None - return Location( - address=address, city=city, zip=zip_code, country=country - ) + return Location(address=address, city=city, zip=zip_code, country=country) def postprocess(self, results: OutputType, original_input: InputType) -> OutputType: """Create Neo4j nodes and relationships from extracted data.""" @@ -269,33 +269,14 @@ class IndividualToDomainsTransform(Transform): if domain_name in processed_domains: continue processed_domains.add(domain_name) - # Create individual node - self.create_node( - "individual", - "full_name", - individual.full_name, - **individual.__dict__, - ) - + self.create_node(individual) # Create domain node - self.create_node( - "domain", - "domain", - domain_name, - **domain.__dict__, - ) + self.create_node(domain) # Create relationship between individual and domain - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "domain", - "domain", - domain_name, - "HAS_REGISTERED_DOMAIN", - ) + domain_obj_indiv = Domain(domain=domain_name) + self.create_relationship(individual, domain_obj_indiv, "HAS_REGISTERED_DOMAIN") # Process all contact types for contact_type, contact in contacts.items(): @@ -342,36 +323,16 @@ class IndividualToDomainsTransform(Transform): return processed_individuals.add(individual_id) - # Create individual node - self.create_node( - "individual", - "full_name", - contact_individual.full_name, - **contact_individual.__dict__, - ) + self.create_node(contact_individual) # Create relationship between individual and domain - self.create_relationship( - "individual", - "full_name", - contact_individual.full_name, - "domain", - "domain", - domain_name, - f"IS_{contact_type}_CONTACT", - ) + domain_obj_contact = Domain(domain=domain_name) + 
self.create_relationship(contact_individual, domain_obj_contact, f"IS_{contact_type}_CONTACT") # Create relationship between contact individual and main individual - self.create_relationship( - "individual", - "full_name", - contact_individual.full_name, - "individual", - "full_name", - individual_name, - f"WORKS_FOR", - ) + main_individual = Individual(first_name="", last_name="", full_name=individual_name) + self.create_relationship(contact_individual, main_individual, "WORKS_FOR") # Process email addresses if contact_individual.email_addresses: @@ -381,23 +342,11 @@ class IndividualToDomainsTransform(Transform): processed_emails.add(email_str) # Create email node - self.create_node( - "email", - "email", - email_str, - email=email_str, - ) + email_obj = Email(email=email_str) + self.create_node(email_obj) # Create relationship between individual and email - self.create_relationship( - "individual", - "full_name", - contact_individual.full_name, - "email", - "email", - email_str, - "HAS_EMAIL", - ) + self.create_relationship(contact_individual, email_obj, "HAS_EMAIL") # Process phone numbers if contact_individual.phone_numbers: @@ -407,23 +356,11 @@ class IndividualToDomainsTransform(Transform): processed_phones.add(phone_str) # Create phone node - self.create_node( - "phone", - "number", - phone_str, - number=phone_str, - ) + phone_obj = Phone(number=phone_str) + self.create_node(phone_obj) # Create relationship between individual and phone - self.create_relationship( - "individual", - "full_name", - contact_individual.full_name, - "phone", - "number", - phone_str, - "HAS_PHONE", - ) + self.create_relationship(contact_individual, phone_obj, "HAS_PHONE") # Process physical address address = self.__extract_physical_address(contact) @@ -435,23 +372,10 @@ class IndividualToDomainsTransform(Transform): processed_addresses.add(address_id) # Create address node - self.create_node( - "location", - "address", - address.address, - address=address, - ) + 
self.create_node(address) # Create relationship between individual and address - self.create_relationship( - "individual", - "full_name", - contact_individual.full_name, - "location", - "address", - address.address, - "LIVES_AT", - ) + self.create_relationship(contact_individual, address, "LIVES_AT") # Make types available at module level for easy access diff --git a/flowsint-transforms/src/flowsint_transforms/individual/to_org.py b/flowsint-transforms/src/flowsint_transforms/individual/to_org.py index adb6549..c25c88d 100644 --- a/flowsint-transforms/src/flowsint_transforms/individual/to_org.py +++ b/flowsint-transforms/src/flowsint_transforms/individual/to_org.py @@ -293,12 +293,7 @@ class IndividualToOrgTransform(Transform): continue processed_organizations.add(org_key) - self.create_node( - "Organization", - "org_id", - org_key, - **org.__dict__, - ) + self.create_node(org) # Then, create all individual nodes for individual in original_input: @@ -308,12 +303,7 @@ class IndividualToOrgTransform(Transform): processed_individuals.add(individual_id) # Create individual node - self.create_node( - "individual", - "full_name", - individual_id, - **individual.__dict__, - ) + self.create_node(individual) # Finally, create relationships between all individuals and all organizations for individual in original_input: @@ -322,15 +312,7 @@ class IndividualToOrgTransform(Transform): org_key = f"{org.name}_FR" # Create relationship between individual and organization - self.create_relationship( - "individual", - "full_name", - individual_id, - "Organization", - "org_id", - org_key, - "WORKS_FOR", - ) + self.create_relationship(individual, org, "WORKS_FOR") self.log_graph_message( f"Created {len(results)} organizations and {len(original_input)} individuals with relationships" diff --git a/flowsint-transforms/src/flowsint_transforms/ip/to_asn.py b/flowsint-transforms/src/flowsint_transforms/ip/to_asn.py index 9021407..4024b3e 100644 --- 
a/flowsint-transforms/src/flowsint_transforms/ip/to_asn.py +++ b/flowsint-transforms/src/flowsint_transforms/ip/to_asn.py @@ -62,22 +62,6 @@ class IpToAsnTransform(Transform): def key(cls) -> str: return "address" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - ip_obj = None - if isinstance(item, str): - if is_valid_ip(item): - ip_obj = Ip(address=item) - elif isinstance(item, dict) and "address" in item: - if is_valid_ip(item["address"]): - ip_obj = Ip(address=item["address"]) - elif isinstance(item, Ip): - ip_obj = item - if ip_obj: - cleaned.append(ip_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] asnmap = AsnmapTool() @@ -130,19 +114,11 @@ class IpToAsnTransform(Transform): if input_data and self.neo4j_conn: for ip, asn in zip(input_data, results): # Create IP node - self.create_node("ip", "address", ip.address, label=ip.address, type="ip", **ip.__dict__) + self.create_node(ip) # Create ASN node - self.create_node("asn", "number", asn.number, label=f"AS{asn.number}", type="asn", **asn.__dict__) + self.create_node(asn) # Create relationship - self.create_relationship( - "ip", - "address", - ip.address, - "asn", - "number", - asn.number, - "BELONGS_TO", - ) + self.create_relationship(ip, asn, "BELONGS_TO") self.log_graph_message( f"IP {ip.address} belongs to AS{asn.number} ({asn.name})" ) diff --git a/flowsint-transforms/src/flowsint_transforms/ip/to_domain.py b/flowsint-transforms/src/flowsint_transforms/ip/to_domain.py index 2cecd10..3de35ab 100644 --- a/flowsint-transforms/src/flowsint_transforms/ip/to_domain.py +++ b/flowsint-transforms/src/flowsint_transforms/ip/to_domain.py @@ -30,22 +30,6 @@ class ReverseResolveTransform(Transform): def key(cls) -> str: return "address" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - ip_obj = None - if 
isinstance(item, str): - if is_valid_ip(item): - ip_obj = Ip(address=item) - elif isinstance(item, dict) and "address" in item: - if is_valid_ip(item["address"]): - ip_obj = Ip(address=item["address"]) - elif isinstance(item, Ip): - ip_obj = item - if ip_obj: - cleaned.append(ip_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] @@ -89,29 +73,11 @@ class ReverseResolveTransform(Transform): # Create nodes and relationships for each resolved domain for ip_obj in original_input: # Create IP node - self.create_node("ip", "address", ip_obj.address, **ip_obj.__dict__) - + self.create_node(ip_obj) # Create domain nodes and relationships for each resolved domain for domain_obj in results: - self.create_node( - "domain", - "domain", - domain_obj.domain, - type=( - "domain" - if "." not in domain_obj.domain.split(".")[1:] - else "subdomain" - ), - ) - self.create_relationship( - "ip", - "address", - ip_obj.address, - "domain", - "domain", - domain_obj.domain, - "REVERSE_RESOLVES_TO", - ) + self.create_node(domain_obj) + self.create_relationship(ip_obj, domain_obj, "REVERSE_RESOLVES_TO") self.log_graph_message( f"Domain found for IP {ip_obj.address} -> {domain_obj.domain}" ) @@ -119,6 +85,5 @@ class ReverseResolveTransform(Transform): return results -# Make types available at module level for easy access InputType = ReverseResolveTransform.InputType OutputType = ReverseResolveTransform.OutputType diff --git a/flowsint-transforms/src/flowsint_transforms/ip/to_infos.py b/flowsint-transforms/src/flowsint_transforms/ip/to_infos.py index c76270e..6c5dd14 100644 --- a/flowsint-transforms/src/flowsint_transforms/ip/to_infos.py +++ b/flowsint-transforms/src/flowsint_transforms/ip/to_infos.py @@ -1,12 +1,7 @@ import requests -from typing import List, Dict, Any, TypeAlias, Union -from pydantic import TypeAdapter +from typing import List, Dict, Any from flowsint_core.core.transform_base import Transform -from flowsint_types.ip import 
Ip, Ip -from flowsint_core.utils import resolve_type, is_valid_ip - -InputType: TypeAlias = List[Ip] -OutputType: TypeAlias = List[Ip] +from flowsint_types.ip import Ip class IpToInfosTransform(Transform): @@ -28,60 +23,18 @@ class IpToInfosTransform(Transform): def key(cls) -> str: return "address" - @classmethod - def input_schema(cls) -> Dict[str, Any]: - adapter = TypeAdapter(InputType) - schema = adapter.json_schema() - type_name, details = list(schema["$defs"].items())[0] - return { - "type": type_name, - "properties": [ - {"name": prop, "type": resolve_type(info, schema)} - for prop, info in details["properties"].items() - ], - } - - @classmethod - def output_schema(cls) -> Dict[str, Any]: - adapter = TypeAdapter(OutputType) - schema = adapter.json_schema() - type_name, details = list(schema["$defs"].items())[0] - return { - "type": type_name, - "properties": [ - {"name": prop, "type": resolve_type(info, schema)} - for prop, info in details["properties"].items() - ], - } - - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - ip_obj = None - if isinstance(item, str): - ip_obj = Ip(address=item) - elif isinstance(item, dict) and "address" in item: - ip_obj = Ip(address=item["address"]) - elif isinstance(item, Ip): - ip_obj = item - if ip_obj and is_valid_ip(ip_obj.address): - cleaned.append(ip_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] for ip in data: try: geo_data = self.get_location_data(ip.address) - enriched_ip = Ip( - address=ip.address, - latitude=geo_data.get("latitude"), - longitude=geo_data.get("longitude"), - country=geo_data.get("country"), - city=geo_data.get("city"), - isp=geo_data.get("isp"), - ) - results.append(enriched_ip) + # Enrich the existing IP object with geo data + ip.latitude = geo_data.get("latitude") + ip.longitude = geo_data.get("longitude") + ip.country = geo_data.get("country") + ip.city = 
geo_data.get("city") + ip.isp = geo_data.get("isp") + results.append(ip) except Exception as e: print(f"Error geolocating {ip.address}: {e}") return results @@ -90,17 +43,7 @@ class IpToInfosTransform(Transform): """Update IP nodes in Neo4j with geolocation information.""" if self.neo4j_conn: for ip in results: - self.create_node( - "ip", - "address", - ip.address, - latitude=ip.latitude, - longitude=ip.longitude, - country=ip.country, - city=ip.city, - isp=ip.isp, - type="ip", - ) + self.create_node(ip) self.log_graph_message( f"Geolocated {ip.address} to {ip.city}, {ip.country} (lat: {ip.latitude}, lon: {ip.longitude})" ) @@ -129,7 +72,3 @@ class IpToInfosTransform(Transform): except Exception as e: print(f"Failed to geolocate {address}: {e}") return {} - - -InputType = IpToInfosTransform.InputType -OutputType = IpToInfosTransform.OutputType diff --git a/flowsint-transforms/src/flowsint_transforms/ip/to_ports.py b/flowsint-transforms/src/flowsint_transforms/ip/to_ports.py index f97f8e0..2b9d2c3 100644 --- a/flowsint-transforms/src/flowsint_transforms/ip/to_ports.py +++ b/flowsint-transforms/src/flowsint_transforms/ip/to_ports.py @@ -107,22 +107,6 @@ class IpToPortsTransform(Transform): def key(cls) -> str: return "address" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - ip_obj = None - if isinstance(item, str): - if is_valid_ip(item): - ip_obj = Ip(address=item) - elif isinstance(item, dict) and "address" in item: - if is_valid_ip(item["address"]): - ip_obj = Ip(**item) - elif isinstance(item, Ip): - ip_obj = item - if ip_obj: - cleaned.append(ip_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] naabu = NaabuTool() @@ -214,34 +198,13 @@ class IpToPortsTransform(Transform): ip_address = getattr(port, "_ip_address", None) if not ip_address: continue - - # Create Port node with composite key (ip:port) to handle multiple IPs + 
port_id = f"{ip_address}:{port.number}" - port_label = f"{port.number}/{port.protocol}" - self.create_node( - "port", - "id", - port_id, - label=port_label, - type="port", - number=port.number, - protocol=port.protocol, - state=port.state, - service=port.service, - banner=port.banner, - ip_address=ip_address, - ) + self.create_node(port) # Create relationship from IP to Port - self.create_relationship( - "ip", - "address", - ip_address, - "port", - "id", - port_id, - "HAS_PORT", - ) + ip_obj = Ip(address=ip_address) + self.create_relationship(ip_obj, port, "HAS_PORT") service_info = f" ({port.service})" if port.service else "" self.log_graph_message( diff --git a/flowsint-transforms/src/flowsint_transforms/leak/to_leaks.py b/flowsint-transforms/src/flowsint_transforms/leak/to_leaks.py deleted file mode 100644 index 7555646..0000000 --- a/flowsint-transforms/src/flowsint_transforms/leak/to_leaks.py +++ /dev/null @@ -1,156 +0,0 @@ -from typing import Dict, Any, List, Union, Optional -import hibpwned -from flowsint_core.core.transform_base import Transform -from flowsint_core.core.logger import Logger -from flowsint_core.core.graph_db import Neo4jConnection -import os -from dotenv import load_dotenv - -# Load environment variables -load_dotenv() - -HIBP_API_KEY = os.getenv("HIBP_API_KEY") - - -class HibpTransform(Transform): - """Queries HaveIBeenPwned for potential leaks.""" - - # Define types as class attributes - base class handles schema generation automatically - InputType = List[str] # Email addresses as strings - OutputType = List[Dict[str, Any]] # Breach results as dictionaries - - def __init__( - self, - sketch_id: Optional[str] = None, - scan_id: Optional[str] = None, - neo4j_conn: Optional[Neo4jConnection] = None, - vault=None, - params: Optional[Dict[str, Any]] = None, - ): - super().__init__( - sketch_id=sketch_id, - scan_id=scan_id, - neo4j_conn=neo4j_conn, - params_schema=self.get_params_schema(), - vault=vault, - params=params, - ) - - @classmethod - 
def required_params(cls) -> bool: - return True - - @classmethod - def get_params_schema(cls) -> List[Dict[str, Any]]: - """Declare required parameters for this transform""" - return [ - { - "name": "HIBP_API_KEY", - "type": "vaultSecret", - "description": "The HIBP API key to use for breach lookups.", - "required": True, - }, - ] - - @classmethod - def name(cls) -> str: - return "to_hibp_leaks" - - @classmethod - def category(cls) -> str: - return "leaks" - - @classmethod - def key(cls) -> str: - return "email" - - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - if isinstance(item, str): - cleaned.append(item) - elif isinstance(item, dict) and "email" in item: - cleaned.append(item["email"]) - return cleaned - - async def scan(self, data: InputType) -> OutputType: - """Performs a search on HaveIBeenPwned for a list of emails.""" - results: OutputType = [] - api_key = self.get_secret("HIBP_API_KEY", os.getenv("HIBP_API_KEY")) - - for email in data: - try: - result = hibpwned.Pwned(email, "MyHIBPChecker", api_key) - - # Clear data structure for results - breaches = result.search_all_breaches() - pastes = result.search_pastes() - password = result.search_password("BadPassword") - hashes = result.search_hashes("21BD1") - - email_result = { - "email": email, - "breaches": breaches if breaches else [], - "adobe": result.single_breach("adobe") or {}, - "data": result.data_classes() or [], - "pastes": pastes if pastes else [], - "password": password if password else {}, - "hashes": hashes if hashes else [], - } - results.append(email_result) - except Exception as e: - results.append( - { - "email": email, - "error": f"Error during scan: {str(e)}", - } - ) - Logger.error( - self.sketch_id, - {"message": f"Error scanning email {email}: {str(e)}"}, - ) - - return results - - def postprocess(self, results: OutputType, original_input: InputType) -> OutputType: - """Create Neo4j relationships for 
found breaches.""" - if not self.neo4j_conn: - return results - - for result in results: - if "error" not in result: - email = result["email"] - - # Create email node - self.create_node("email", "address", email, email=email) - - # Create breach relationships - for breach in result.get("breaches", []): - if breach and isinstance(breach, dict): - breach_name = breach.get("Name", "Unknown") - self.create_node( - "breach", - "name", - breach_name, - caption=breach_name, - type="breach", - ) - self.create_relationship( - "email", - "address", - email, - "breach", - "name", - breach_name, - "FOUND_IN_BREACH", - ) - self.log_graph_message( - f"Email {email} found in breach: {breach_name}" - ) - - return results - - -# Make types available at module level for easy access -InputType = HibpTransform.InputType -OutputType = HibpTransform.OutputType diff --git a/flowsint-transforms/src/flowsint_transforms/organization/__init__.py b/flowsint-transforms/src/flowsint_transforms/organization/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/flowsint-transforms/src/flowsint_transforms/organization/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/flowsint-transforms/src/flowsint_transforms/organization/to_asn.py b/flowsint-transforms/src/flowsint_transforms/organization/to_asn.py index e68486f..645f4eb 100644 --- a/flowsint-transforms/src/flowsint_transforms/organization/to_asn.py +++ b/flowsint-transforms/src/flowsint_transforms/organization/to_asn.py @@ -60,20 +60,6 @@ class OrgToAsnTransform(Transform): def key(cls) -> str: return "name" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - org_obj = None - if isinstance(item, str): - org_obj = Organization(name=item) - elif isinstance(item, dict) and "name" in item: - org_obj = Organization(name=item["name"]) - elif isinstance(item, Organization): - org_obj = item - if org_obj: - cleaned.append(org_obj) - return cleaned - async 
def scan(self, data: InputType) -> OutputType: """Find ASN information for organizations using asnmap.""" results: OutputType = [] @@ -129,36 +115,13 @@ class OrgToAsnTransform(Transform): if self.neo4j_conn: # Create organization node - self.create_node( - "organization", - "name", - input_org.name, - caption=input_org.name, - type="organization", - ) + self.create_node(input_org) # Create ASN node - self.create_node( - "asn", - "number", - result_asn.number, - name=result_asn.name, - country=result_asn.country, - label=f"AS{result_asn.number}", - caption=f"AS{result_asn.number} - {result_asn.name}", - type="asn", - ) + self.create_node(result_asn) # Create relationship - self.create_relationship( - "organization", - "name", - input_org.name, - "asn", - "number", - result_asn.number, - "BELONGS_TO", - ) + self.create_relationship(input_org, result_asn, "BELONGS_TO") self.log_graph_message( f"Found for {input_org.name} -> ASN {result_asn.number}" diff --git a/flowsint-transforms/src/flowsint_transforms/organization/to_domains.py b/flowsint-transforms/src/flowsint_transforms/organization/to_domains.py index 67aa62b..b46bfd5 100644 --- a/flowsint-transforms/src/flowsint_transforms/organization/to_domains.py +++ b/flowsint-transforms/src/flowsint_transforms/organization/to_domains.py @@ -2,6 +2,7 @@ import os import re from typing import Any, List, Dict, Set, Optional from flowsint_core.core.transform_base import Transform +from flowsint_types import Email from flowsint_types.domain import Domain from flowsint_types.organization import Organization from flowsint_types.individual import Individual @@ -433,13 +434,8 @@ class OrgToDomainsTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating organization node: {org_name}"}, ) - self.create_node( - "organization", - "name", - org_name, - caption=org_name, - type="organization", - ) + org_obj = Organization(name=org_name) + self.create_node(org_obj) # Create domain node if not already processed if domain_name 
not in processed_domains: @@ -448,25 +444,13 @@ class OrgToDomainsTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating domain node: {domain_name}"}, ) - self.create_node( - "domain", - "domain", - domain_name, - label=domain_name, - caption=domain_name, - type="domain", - ) + domain_obj = Domain(domain=domain_name) + self.create_node(domain_obj) # Create relationship between organization and domain - self.create_relationship( - "organization", - "name", - org_name, - "domain", - "domain", - domain_name, - "HAS_REGISTERED_DOMAIN", - ) + org_obj_domain = Organization(name=org_name) + domain_obj_org = Domain(domain=domain_name) + self.create_relationship(org_obj_domain, domain_obj_org, "HAS_REGISTERED_DOMAIN") # Create individual node if not already processed individual_id = ( @@ -480,35 +464,15 @@ class OrgToDomainsTransform(Transform): "message": f"[WHOXY] Creating individual node: {individual.full_name}" }, ) - self.create_node( - "individual", - "full_name", - individual.full_name, - caption=individual.full_name, - type="individual", - ) + self.create_node(individual) # Create relationship between individual and domain - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "domain", - "domain", - domain_name, - f"IS_{contact_type.upper()}_CONTACT", - ) + domain_obj_ind = Domain(domain=domain_name) + self.create_relationship(individual, domain_obj_ind, f"IS_{contact_type.upper()}_CONTACT") # Create relationship between individual and organization - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "organization", - "name", - org_name, - "WORKS_FOR", - ) + org_obj_ind = Organization(name=org_name) + self.create_relationship(individual, org_obj_ind, "WORKS_FOR") # Process email addresses if individual.email_addresses: @@ -520,22 +484,9 @@ class OrgToDomainsTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating email node: {email_str}"}, ) - self.create_node( - "email", - 
"email", - email_str, - caption=email_str, - type="email", - ) - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "email", - "email", - email_str, - "HAS_EMAIL", - ) + email_obj = Email(email=email_str) + self.create_node(email_obj) + self.create_relationship(individual, email_obj, "HAS_EMAIL") # Process phone numbers if individual.phone_numbers: @@ -547,22 +498,9 @@ class OrgToDomainsTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating phone node: {phone_str}"}, ) - self.create_node( - "phone", - "number", - phone_str, - caption=phone_str, - type="phone", - ) - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "phone", - "number", - phone_str, - "HAS_PHONE", - ) + phone_obj = Phone(number=phone_str) + self.create_node(phone_obj) + self.create_relationship(individual, phone_obj, "HAS_PHONE") # Process physical address from contact data contact_data = individual_info["contact_data"] @@ -579,22 +517,8 @@ class OrgToDomainsTransform(Transform): "message": f"[WHOXY] Creating address node: {address.address}" }, ) - self.create_node( - "location", - "address", - address.address, - caption=f"{address.address}, {address.city}", - type="location", - ) - self.create_relationship( - "individual", - "full_name", - individual.full_name, - "location", - "address", - address.address, - "LIVES_AT", - ) + self.create_node(address) + self.create_relationship(individual, address, "LIVES_AT") self.log_graph_message( f"Processed individual {individual.full_name} ({contact_type}) for domain {domain_name}" @@ -621,13 +545,8 @@ class OrgToDomainsTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating organization node: {org_name}"}, ) - self.create_node( - "organization", - "name", - org_name, - caption=org_name, - type="organization", - ) + org_obj = Organization(name=org_name) + self.create_node(org_obj) # Create domain node if not already processed if domain_name not in processed_domains: @@ 
-636,25 +555,13 @@ class OrgToDomainsTransform(Transform): self.sketch_id, {"message": f"[WHOXY] Creating domain node: {domain_name}"}, ) - self.create_node( - "domain", - "domain", - domain_name, - label=domain_name, - caption=domain_name, - type="domain", - ) + domain_obj = Domain(domain=domain_name) + self.create_node(domain_obj) # Create relationship between input organization and domain - self.create_relationship( - "organization", - "name", - org_name, - "domain", - "domain", - domain_name, - "HAS_REGISTERED_DOMAIN", - ) + org_obj_domain2 = Organization(name=org_name) + domain_obj_org2 = Domain(domain=domain_name) + self.create_relationship(org_obj_domain2, domain_obj_org2, "HAS_REGISTERED_DOMAIN") # Create extracted organization node if not already processed if organization.name not in processed_organizations: @@ -665,24 +572,11 @@ class OrgToDomainsTransform(Transform): "message": f"[WHOXY] Creating organization node: {organization.name}" }, ) - self.create_node( - "organization", - "name", - organization.name, - caption=organization.name, - type="organization", - ) + self.create_node(organization) # Create relationship between extracted organization and domain - self.create_relationship( - "organization", - "name", - organization.name, - "domain", - "domain", - domain_name, - f"IS_{contact_type.upper()}_CONTACT", - ) + domain_obj_extracted = Domain(domain=domain_name) + self.create_relationship(organization, domain_obj_extracted, f"IS_{contact_type.upper()}_CONTACT") self.log_graph_message( f"Processed organization {organization.name} ({contact_type}) for domain {domain_name}" diff --git a/flowsint-transforms/src/flowsint_transforms/organization/to_infos.py b/flowsint-transforms/src/flowsint_transforms/organization/to_infos.py index 3f78691..bdf72d4 100644 --- a/flowsint-transforms/src/flowsint_transforms/organization/to_infos.py +++ b/flowsint-transforms/src/flowsint_transforms/organization/to_infos.py @@ -24,19 +24,6 @@ class 
OrgToInfosTransform(Transform): def key(cls) -> str: return "name" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - if not isinstance(data, list): - raise ValueError(f"Expected list input, got {type(data).__name__}") - cleaned: InputType = [] - for item in data: - if isinstance(item, str) and item != "": - cleaned.append(Organization(name=item)) - elif isinstance(item, dict) and "name" in item and item["name"] != "": - cleaned.append(Organization(**item)) - elif isinstance(item, Organization): - cleaned.append(item) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] @@ -266,35 +253,7 @@ class OrgToInfosTransform(Transform): for org in results: # Create or update the organization node with all SIRENE properties org_key = f"{org.name}_FR" - self.create_node( - "Organization", - "org_id", - org_key, - name=org.name, - country="FR", - siren=org.siren, - siege_siret=org.siege_siret, - nom_complet=org.nom_complet, - nom_raison_sociale=org.nom_raison_sociale, - sigle=org.sigle, - nombre_etablissements=org.nombre_etablissements, - nombre_etablissements_ouverts=org.nombre_etablissements_ouverts, - activite_principale=org.activite_principale, - section_activite_principale=org.section_activite_principale, - categorie_entreprise=org.categorie_entreprise, - annee_categorie_entreprise=org.annee_categorie_entreprise, - caractere_employeur=org.caractere_employeur, - tranche_effectif_salarie=org.tranche_effectif_salarie, - annee_tranche_effectif_salarie=org.annee_tranche_effectif_salarie, - date_creation=org.date_creation, - date_fermeture=org.date_fermeture, - date_mise_a_jour=org.date_mise_a_jour, - date_mise_a_jour_insee=org.date_mise_a_jour_insee, - date_mise_a_jour_rne=org.date_mise_a_jour_rne, - nature_juridique=org.nature_juridique, - statut_diffusion=org.statut_diffusion, - type="organization", - ) + self.create_node(org) if org.siren: self.log_graph_message(f"{org.name}: SIREN {org.siren} -> 
{org.name}") @@ -308,27 +267,9 @@ class OrgToInfosTransform(Transform): # Add dirigeants (leaders) as Individual nodes with relationships if org.dirigeants: for dirigeant in org.dirigeants: - self.create_node( - "individual", - "full_name", - dirigeant.full_name, - first_name=dirigeant.first_name, - last_name=dirigeant.last_name, - birth_date=dirigeant.birth_date, - gender=dirigeant.gender, - caption=dirigeant.full_name, - type="individual", - ) + self.create_node(dirigeant) - self.create_relationship( - "organization", - "org_id", - org_key, - "individual", - "full_name", - dirigeant.full_name, - "HAS_LEADER", - ) + self.create_relationship(org, dirigeant, "HAS_LEADER") self.log_graph_message( f"{org.name}: HAS_LEADER -> {dirigeant.full_name}" ) @@ -337,30 +278,9 @@ class OrgToInfosTransform(Transform): if org.siege_geo_adresse: address = org.siege_geo_adresse address_key = f"{address.address}_{address.city}_{address.country}" - self.create_node( - "location", - "address_id", - address_key, - address=address.address, - city=address.city, - country=address.country, - zip=address.zip, - latitude=address.latitude, - longitude=address.longitude, - label=f"{address.address}, {address.city}", - caption=f"{address.address}, {address.city}", - type="location", - ) + self.create_node(address) - self.create_relationship( - "organization", - "org_id", - org_key, - "location", - "address_id", - address_key, - "HAS_ADDRESS", - ) + self.create_relationship(org, address, "HAS_ADDRESS") self.log_graph_message( f"{org.name}: HAS_ADDRESS -> {address.address}, {address.city}" ) @@ -383,15 +303,15 @@ class OrgToInfosTransform(Transform): type="location", ) - self.create_relationship( - "organization", - "org_id", - org_key, - "Location", - "location_id", - location_key, - "LOCATED_AT", + location_obj = Location( + latitude=float(org.siege_latitude), + longitude=float(org.siege_longitude), + address=org.siege_adresse, + city=org.siege_libelle_commune, + country="FR", + 
zip=org.siege_code_postal, ) + self.create_relationship(org, location_obj, "LOCATED_AT") self.log_graph_message( f"{org.name}: LOCATED_AT -> {org.siege_libelle_commune or 'Unknown'}" ) diff --git a/flowsint-transforms/src/flowsint_transforms/phone/to_infos.py b/flowsint-transforms/src/flowsint_transforms/phone/to_infos.py index ca49d60..202b455 100644 --- a/flowsint-transforms/src/flowsint_transforms/phone/to_infos.py +++ b/flowsint-transforms/src/flowsint_transforms/phone/to_infos.py @@ -10,7 +10,7 @@ import httpx class IgnorantTransform(Transform): # Define types as class attributes - base class handles schema generation automatically - InputType = List[str] # Phone numbers as strings + InputType = List[Phone] # Phone objects OutputType = List[Dict[str, Any]] # Results as dictionaries @classmethod @@ -25,43 +25,29 @@ class IgnorantTransform(Transform): def key(cls) -> str: return "number" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for number in data: - phone_obj = None - if isinstance(number, str): - phone_obj = Phone(number=number) - elif isinstance(number, dict) and "number" in number: - phone_obj = Phone(number=number["number"]) - elif isinstance(number, Phone): - phone_obj = number - if phone_obj: - cleaned.append(phone_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: """ Performs the Ignorant search for each specified phone number. 
""" results: OutputType = [] - for phone in data: + for phone_obj in data: try: - cleaned_phone = is_valid_number(phone) + cleaned_phone = is_valid_number(phone_obj.number) if cleaned_phone: result = await self._perform_ignorant_research(cleaned_phone) results.append(result) else: - results.append({"number": phone, "error": "Invalid phone number"}) + results.append({"number": phone_obj.number, "error": "Invalid phone number"}) except Exception as e: results.append( { - "number": phone, + "number": phone_obj.number, "error": f"Unexpected error in Ignorant scan: {str(e)}", } ) Logger.error( self.sketch_id, - {"message": f"Error scanning phone {phone}: {str(e)}"}, + {"message": f"Error scanning phone {phone_obj.number}: {str(e)}"}, ) return results @@ -100,13 +86,8 @@ class IgnorantTransform(Transform): for result in results: if "error" not in result and "platforms" in result: - self.create_node( - "phone", - "number", - result["number"], - caption=result["number"], - type="phone", - ) + phone_obj = Phone(number=result["number"]) + self.create_node(phone_obj) # Create platform relationships for platform_result in result["platforms"]: diff --git a/flowsint-transforms/src/flowsint_transforms/phone/to_leaks.py b/flowsint-transforms/src/flowsint_transforms/phone/to_leaks.py deleted file mode 100644 index a75a856..0000000 --- a/flowsint-transforms/src/flowsint_transforms/phone/to_leaks.py +++ /dev/null @@ -1,228 +0,0 @@ -import os -from typing import Any, Dict, List, Optional, Union -import requests -from urllib.parse import urljoin -from flowsint_core.core.transform_base import Transform -from flowsint_core.core.logger import Logger -from flowsint_types.phone import Phone -from flowsint_types.breach import Breach -from dotenv import load_dotenv -from flowsint_core.core.graph_db import Neo4jConnection - -# Load environment variables -load_dotenv() - -HIBP_API_KEY = os.getenv("HIBP_API_KEY") - - -class PhoneToBreachesTransform(Transform): - """[HIBPWNED] Get the breaches 
the phone number might be invovled in.""" - - InputType = List[Phone] - OutputType = List[tuple] # List of (phone, breach) tuples - - def __init__( - self, - sketch_id: Optional[str] = None, - scan_id: Optional[str] = None, - neo4j_conn: Optional[Neo4jConnection] = None, - vault=None, - params: Optional[Dict[str, Any]] = None, - ): - super().__init__( - sketch_id=sketch_id, - scan_id=scan_id, - neo4j_conn=neo4j_conn, - params_schema=self.get_params_schema(), - vault=vault, - params=params, - ) - - @classmethod - def name(cls) -> str: - return "phone_to_breaches" - - @classmethod - def category(cls) -> str: - return "Email" - - @classmethod - def key(cls) -> str: - return "phone" - - @classmethod - def required_params(cls) -> bool: - return True - - @classmethod - def get_params_schema(cls) -> List[Dict[str, Any]]: - """Declare required parameters for this transform""" - return [ - { - "name": "HIBP_API_KEY", - "type": "vaultSecret", - "description": "The HIBP API key to use for breaches lookup.", - "required": True, - }, - { - "name": "HIBP_API_URL", - "type": "url", - "description": "The HIBP API URL to use for breaches lookup.", - "required": False, - "default": "https://haveibeenpwned.com/api/v3/breachedaccount/", - }, - ] - - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for number in data: - phone_obj = None - if isinstance(number, str): - phone_obj = Phone(number=number) - elif isinstance(number, dict) and "number" in number: - phone_obj = Phone(number=number["number"]) - elif isinstance(number, Phone): - phone_obj = number - if phone_obj: - cleaned.append(phone_obj) - return cleaned - - async def scan(self, data: InputType) -> OutputType: - results: OutputType = [] - api_key = self.get_secret("HIBP_API_KEY", os.getenv("HIBP_API_KEY")) - api_url = self.get_params().get("HIBP_API_URL", "https://haveibeenpwned.com/api/v3/breachedaccount/") - headers = {"hibp-api-key": api_key, "User-Agent": 
"FlowsInt-Transform"} - Logger.info(self.sketch_id, {"message": f"HIBP API URL: {api_url}"}) - for phone in data: - try: - # Query Have I Been Pwned API - full_url = urljoin(api_url, f"{phone.number}?truncateResponse=false") - Logger.error(self.sketch_id, {"message": f"full url: {full_url}"}) - response = requests.get(full_url, headers=headers, timeout=10) - Logger.info( - self.sketch_id, {"message": f"HIBP API response: {response.json()}"} - ) - if response.status_code == 200: - breaches_data = response.json() - Logger.info( - self.sketch_id, - { - "message": f"Found {len(breaches_data)} breaches for {phone.number}" - }, - ) - for breach_data in breaches_data: - breach = Breach( - name=breach_data.get("Name", ""), - title=breach_data.get("Title", ""), - domain=breach_data.get("Domain", ""), - breachdate=breach_data.get("BreachDate", ""), - addeddate=breach_data.get("AddedDate", ""), - modifieddate=breach_data.get("ModifiedDate", ""), - pwncount=breach_data.get("PwnCount", 0), - description=breach_data.get("Description", ""), - dataclasses=breach_data.get("DataClasses", []), - isverified=breach_data.get("IsVerified", False), - isfabricated=breach_data.get("IsFabricated", False), - issensitive=breach_data.get("IsSensitive", False), - isretired=breach_data.get("IsRetired", False), - isspamlist=breach_data.get("IsSpamList", False), - logopath=breach_data.get("LogoPath", ""), - ) - # Store phone and breach as a tuple - results.append((phone.number, breach)) - Logger.info( - self.sketch_id, - { - "message": f"Added breach: {breach.name} for phone: {phone.number}" - }, - ) - - elif response.status_code == 404: - # No breaches found for this phone - Logger.info( - self.sketch_id, - {"message": f"No breaches found for phone {phone.number}"}, - ) - continue - - else: - Logger.error( - self.sketch_id, - { - "message": f"HIBP API error for {phone.number}: {response.status_code}" - }, - ) - continue - - except Exception as e: - Logger.error( - self.sketch_id, - { - "message": 
f"Error checking breaches for phone {phone.number}: {e}" - }, - ) - continue - - Logger.info( - self.sketch_id, - {"message": f"Scan completed. Total results: {len(results)}"}, - ) - return results - - def postprocess(self, results: OutputType, original_input: InputType) -> OutputType: - Logger.info( - self.sketch_id, - { - "message": f"Postprocess started. Results count: {len(results)}, Original input count: {len(original_input)}" - }, - ) - - # Create phone nodes first - for phone_obj in original_input: - if not self.neo4j_conn: - continue - # Create phone node - self.create_node("phone", "phone", phone_obj.number, **phone_obj.__dict__) - Logger.info( - self.sketch_id, {"message": f"Created phone node: {phone_obj.number}"} - ) - - # Process all breaches - for phone, breach_obj in results: - if not self.neo4j_conn: - continue - - # Create breach node - breach_key = f"{breach_obj.name}_{self.sketch_id}" - self.create_node( - "breach", - "breach_id", - breach_key, - **breach_obj.dict(), - label=breach_obj.name, - type="breach", - ) - Logger.info( - self.sketch_id, {"message": f"Created breach node: {breach_key}"} - ) - - # Create relationship between the specific phone and this breach - self.create_relationship( - "phone", - "number", - phone, - "breach", - "breach_id", - breach_key, - "FOUND_IN_BREACH", - ) - self.log_graph_message( - f"Breach found for phone {phone} -> {breach_obj.name} ({breach_obj.title})" - ) - - return results - - -# Make types available at module level for easy access -InputType = PhoneToBreachesTransform.InputType -OutputType = PhoneToBreachesTransform.OutputType diff --git a/flowsint-transforms/src/flowsint_transforms/social/to_maigret.py b/flowsint-transforms/src/flowsint_transforms/social/to_maigret.py index e288aab..67af127 100644 --- a/flowsint-transforms/src/flowsint_transforms/social/to_maigret.py +++ b/flowsint-transforms/src/flowsint_transforms/social/to_maigret.py @@ -1,8 +1,7 @@ import json import subprocess from pathlib import 
Path -from typing import List, Union -from flowsint_core.utils import is_valid_username +from typing import List from flowsint_core.core.transform_base import Transform from flowsint_types import Username from flowsint_types.social_account import SocialAccount @@ -30,25 +29,6 @@ class MaigretTransform(Transform): def key(cls) -> str: return "username" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - obj = None - if isinstance(item, str): - obj = Username(value=item) - elif isinstance(item, dict) and "username" in item: - obj = Username(value=item["username"]) - elif isinstance(item, dict) and "value" in item: - obj = Username(value=item["value"]) - elif isinstance(item, SocialAccount): - obj = Username(value=item.username.value) - obj = item.username.value - elif isinstance(item, Username): - obj = item - if obj and obj.value and is_valid_username(obj.value): - cleaned.append(obj) - return cleaned - def run_maigret(self, username: str) -> Path: output_file = Path(f"/tmp/report_{username}_simple.json") try: @@ -144,43 +124,17 @@ class MaigretTransform(Transform): for profile in results: # Create social profile node - self.create_node( - "social_account", - "platform", - profile.profile_url, - username=profile.username.value, - platform=profile.platform, - profile_picture_url=profile.profile_picture_url, - bio=profile.bio, - followers_count=profile.followers_count, - following_count=profile.following_count, - posts_count=profile.posts_count, - label=f"{profile.username.value}", - type="social_account", - ) + self.create_node(profile.username) + self.create_node(profile) # Create username node - self.create_node( - "username", "value", profile.username.value, **profile.username.__dict__ - ) # Create relationship - self.create_relationship( - "username", - "value", - profile.username.value, - "social_account", - "platform", - profile.platform, - "HAS_SOCIAL_ACCOUNT", - ) - + 
self.create_relationship(profile.username, profile, "HAS_SOCIAL_ACCOUNT") self.log_graph_message( f"{profile.username.value} -> account found on {profile.platform}" ) - return results -# Make types available at module level for easy access InputType = MaigretTransform.InputType OutputType = MaigretTransform.OutputType diff --git a/flowsint-transforms/src/flowsint_transforms/social/to_sherlock.py b/flowsint-transforms/src/flowsint_transforms/social/to_sherlock.py index e90930d..a2b4aa8 100644 --- a/flowsint-transforms/src/flowsint_transforms/social/to_sherlock.py +++ b/flowsint-transforms/src/flowsint_transforms/social/to_sherlock.py @@ -26,25 +26,6 @@ class SherlockTransform(Transform): def key(cls) -> str: return "username" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - obj = None - if isinstance(item, str): - obj = Username(value=item) - elif isinstance(item, dict) and "username" in item: - obj = Username(value=item["username"]) - elif isinstance(item, dict) and "value" in item: - obj = Username(value=item["value"]) - elif isinstance(item, SocialAccount): - obj = Username(value=item.username.value) - obj = item.username.value - elif isinstance(item, Username): - obj = item - if obj and obj.value and is_valid_username(obj.value): - cleaned.append(obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: """Performs the scan using Sherlock on the list of usernames.""" results: OutputType = [] @@ -113,15 +94,7 @@ class SherlockTransform(Transform): return results for social_account in results: - self.create_node( - "social", - "username", - social_account.username.value, - platform=social_account.platform, - url=social_account.url, - caption=social_account.platform, - type="social", - ) + self.create_node(social_account) self.log_graph_message( f"Found social account: {social_account.username.value} on {social_account.platform}" ) diff --git 
a/flowsint-transforms/src/flowsint_transforms/website/to_crawler.py b/flowsint-transforms/src/flowsint_transforms/website/to_crawler.py index 67e8604..72ffab1 100644 --- a/flowsint-transforms/src/flowsint_transforms/website/to_crawler.py +++ b/flowsint-transforms/src/flowsint_transforms/website/to_crawler.py @@ -43,20 +43,6 @@ class WebsiteToCrawler(Transform): except Exception: return False - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - website_obj = None - if isinstance(item, str): - website_obj = Website(url=item) - elif isinstance(item, dict) and "url" in item: - website_obj = Website(url=item["url"]) - elif isinstance(item, Website): - website_obj = item - if website_obj: - cleaned.append(website_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: """Crawl websites to extract emails and phone numbers.""" results = [] @@ -155,46 +141,22 @@ class WebsiteToCrawler(Transform): # Create website node if self.neo4j_conn: - self.create_node( - "website", "url", website_url, caption=website_url, type="website" - ) + self.create_node(input_website) # Create email nodes and relationships for email in result["emails"]: - self.create_node( - "email", "email", email.email, caption=email.email, type="email" - ) - self.create_relationship( - "website", - "url", - website_url, - "email", - "email", - email.email, - "HAS_EMAIL", - ) + self.create_node(email) + website_obj = Website(url=website_url) + self.create_relationship(website_obj, email, "HAS_EMAIL") self.log_graph_message( f"Found email {email.email} for website {website_url}" ) # Create phone nodes and relationships for phone in result["phones"]: - self.create_node( - "phone", - "number", - phone.number, - caption=phone.number, - type="phone", - ) - self.create_relationship( - "website", - "url", - website_url, - "phone", - "number", - phone.number, - "HAS_PHONE", - ) + self.create_node(phone) + website_obj2 = 
Website(url=website_url) + self.create_relationship(website_obj2, phone, "HAS_PHONE") self.log_graph_message( f"Found phone {phone.number} for website {website_url}" ) diff --git a/flowsint-transforms/src/flowsint_transforms/website/to_domain.py b/flowsint-transforms/src/flowsint_transforms/website/to_domain.py index cb40032..ff0eda2 100644 --- a/flowsint-transforms/src/flowsint_transforms/website/to_domain.py +++ b/flowsint-transforms/src/flowsint_transforms/website/to_domain.py @@ -25,20 +25,6 @@ class WebsiteToDomainTransform(Transform): def key(cls) -> str: return "website" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - website_obj = None - if isinstance(item, str): - website_obj = Website(url=item) - elif isinstance(item, dict) and "url" in item: - website_obj = Website(url=item["url"]) - elif isinstance(item, Website): - website_obj = item - if website_obj: - cleaned.append(website_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] for website in data: @@ -78,31 +64,11 @@ class WebsiteToDomainTransform(Transform): website_url = str(input_website.url) domain_name = result.domain - self.create_node( - "website", - "url", - website_url, - caption=website_url, - type="website", - ) - + self.create_node(input_website) + # Create relationship with the specific domain for this website - self.create_node( - "domain", - "domain", - domain_name, - caption=domain_name, - type="domain", - ) - self.create_relationship( - "website", - "url", - website_url, - "domain", - "domain", - domain_name, - "HAS_DOMAIN", - ) + self.create_node(result) + self.create_relationship(input_website, result, "HAS_DOMAIN") self.log_graph_message( f"Extracted domain {domain_name} from website {website_url}." 
) diff --git a/flowsint-transforms/src/flowsint_transforms/website/to_links.py b/flowsint-transforms/src/flowsint_transforms/website/to_links.py index f31e3f3..77164d7 100644 --- a/flowsint-transforms/src/flowsint_transforms/website/to_links.py +++ b/flowsint-transforms/src/flowsint_transforms/website/to_links.py @@ -2,6 +2,7 @@ from typing import List, Union from urllib.parse import urlparse from flowsint_core.core.transform_base import Transform from flowsint_types.website import Website +from flowsint_types.domain import Domain from flowsint_core.core.logger import Logger from reconspread import Crawler @@ -33,20 +34,6 @@ class WebsiteToLinks(Transform): except Exception: return "" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - website_obj = None - if isinstance(item, str): - website_obj = Website(url=item) - elif isinstance(item, dict) and "url" in item: - website_obj = Website(url=item["url"]) - elif isinstance(item, Website): - website_obj = item - if website_obj: - cleaned.append(website_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: """Crawl websites using reconspread to extract internal and external links.""" results = [] @@ -63,30 +50,11 @@ class WebsiteToLinks(Transform): # Create main website and domain nodes upfront if self.neo4j_conn: - self.create_node( - "website", - "url", - str(website.url), - caption=str(website.url), - type="website", - ) + self.create_node(website) if main_domain: - self.create_node( - "domain", - "name", - main_domain, - caption=main_domain, - type="domain", - ) - self.create_relationship( - "website", - "url", - str(website.url), - "domain", - "name", - main_domain, - "BELONGS_TO_DOMAIN", - ) + domain_obj = Domain(domain=main_domain) + self.create_node(domain_obj) + self.create_relationship(website, domain_obj, "BELONGS_TO_DOMAIN") self.log_graph_message( f"Website {str(website.url)} belongs to domain 
{main_domain}" ) @@ -105,49 +73,20 @@ class WebsiteToLinks(Transform): external_domains.add(domain) # Create external website node immediately if self.neo4j_conn: - self.create_node( - "website", "url", url, caption=url, type="website" - ) - self.create_relationship( - "website", - "url", - str(website.url), - "website", - "url", - url, - "LINKS_TO", - ) + url_obj = Website(url=url) + self.create_node(url_obj) + self.create_relationship(website, url_obj, "LINKS_TO") self.log_graph_message( f"Website {str(website.url)} links to external website {url}" ) # Create external domain node and link external website to its domain if domain != main_domain: - self.create_node( - "domain", - "name", - domain, - caption=domain, - type="domain", - ) - self.create_relationship( - "website", - "url", - url, - "domain", - "name", - domain, - "BELONGS_TO_DOMAIN", - ) - self.create_relationship( - "website", - "url", - str(website.url), - "domain", - "name", - domain, - "LINKS_TO_DOMAIN", - ) + domain_obj_ext = Domain(domain=domain) + self.create_node(domain_obj_ext) + self.create_relationship(url_obj, domain_obj_ext, "BELONGS_TO_DOMAIN") + domain_obj_main = Domain(domain=main_domain) + self.create_relationship(domain_obj_main, domain_obj_ext, "LINKS_TO") self.log_graph_message( f"External website {url} belongs to domain {domain}" ) @@ -164,33 +103,17 @@ class WebsiteToLinks(Transform): if self.neo4j_conn and url != str( website.url ): # Don't create duplicate of main website - self.create_node( - "website", "url", url, caption=url, type="website" - ) - self.create_relationship( - "website", - "url", - str(website.url), - "website", - "url", - url, - "LINKS_TO", - ) + internal_website = Website(url=url) + self.create_node(internal_website) + self.create_relationship(website, internal_website, "LINKS_TO") self.log_graph_message( f"Website {str(website.url)} links to internal website {url}" ) # Also link internal websites to main domain if main_domain: - self.create_relationship( - 
"website", - "url", - url, - "domain", - "name", - main_domain, - "BELONGS_TO_DOMAIN", - ) + domain_obj_int = Domain(domain=main_domain) + self.create_relationship(internal_website, domain_obj_int, "BELONGS_TO_DOMAIN") Logger.info( self.sketch_id, {"message": f"[INTERNAL] Found: {url}"} ) @@ -256,30 +179,11 @@ class WebsiteToLinks(Transform): # Still create main website and domain nodes even on error main_domain = self.extract_domain(str(website.url)) if self.neo4j_conn: - self.create_node( - "website", - "url", - str(website.url), - caption=str(website.url), - type="website", - ) + self.create_node(website) if main_domain: - self.create_node( - "domain", - "name", - main_domain, - caption=main_domain, - type="domain", - ) - self.create_relationship( - "website", - "url", - str(website.url), - "domain", - "name", - main_domain, - "BELONGS_TO_DOMAIN", - ) + domain_obj_err = Domain(domain=main_domain) + self.create_node(domain_obj_err) + self.create_relationship(website, domain_obj_err, "BELONGS_TO_DOMAIN") self.log_graph_message( f"Website {str(website.url)} belongs to domain {main_domain}" ) diff --git a/flowsint-transforms/src/flowsint_transforms/website/to_text.py b/flowsint-transforms/src/flowsint_transforms/website/to_text.py index a23757a..ae4918a 100644 --- a/flowsint-transforms/src/flowsint_transforms/website/to_text.py +++ b/flowsint-transforms/src/flowsint_transforms/website/to_text.py @@ -24,20 +24,6 @@ class WebsiteToText(Transform): def key(cls) -> str: return "website" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - website_obj = None - if isinstance(item, str): - website_obj = Website(url=item) - elif isinstance(item, dict) and "url" in item: - website_obj = Website(url=item["url"]) - elif isinstance(item, Website): - website_obj = item - if website_obj: - cleaned.append(website_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: 
OutputType = [] for website in data: @@ -67,31 +53,11 @@ class WebsiteToText(Transform): website_url = str(input_website.url) if self.neo4j_conn: - self.create_node( - "website", - "url", - str(website_url), - caption=str(website_url), - type="website", - ) + self.create_node(input_website) # Create relationship with the specific phrase for this website - self.create_node( - "phrase", - "text", - result.text, - caption=result.text, - type="phrase", - ) - self.create_relationship( - "website", - "url", - website_url, - "phrase", - "text", - result.text, - "HAS_INNER_TEXT", - ) + self.create_node(result) + self.create_relationship(input_website, result, "HAS_INNER_TEXT") self.log_graph_message( f"Extracted some text from the website {website_url}." ) diff --git a/flowsint-transforms/src/flowsint_transforms/website/to_webtrackers.py b/flowsint-transforms/src/flowsint_transforms/website/to_webtrackers.py index 4ddefaa..338b407 100644 --- a/flowsint-transforms/src/flowsint_transforms/website/to_webtrackers.py +++ b/flowsint-transforms/src/flowsint_transforms/website/to_webtrackers.py @@ -39,20 +39,6 @@ class WebsiteToWebtrackersTransform(Transform): def key(cls) -> str: return "website" - def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType: - cleaned: InputType = [] - for item in data: - website_obj = None - if isinstance(item, str): - website_obj = Website(url=item) - elif isinstance(item, dict) and "url" in item: - website_obj = Website(url=item["url"]) - elif isinstance(item, Website): - website_obj = item - if website_obj: - cleaned.append(website_obj) - return cleaned - async def scan(self, data: InputType) -> OutputType: results: OutputType = [] @@ -97,29 +83,14 @@ class WebsiteToWebtrackersTransform(Transform): # Create nodes and relationships for each website and its trackers for website_url, trackers in website_trackers.items(): - # Create website node - self.create_node( - "website", "url", website_url, caption=website_url, 
type="website" - ) + # Create website node (we don't have the website object here, so keep minimal) + self.create_node(Website(url=website_url)) # Create tracker nodes and relationships for tracker in trackers: - self.create_node( - "webtracker", - "tracker_id", - tracker.tracker_id, - caption=tracker.name, - type="webtracker" - ) - self.create_relationship( - "website", - "url", - website_url, - "webtracker", - "tracker_id", - tracker.tracker_id, - "HAS_TRACKER", - ) + self.create_node(tracker) + website_obj = Website(url=website_url) + self.create_relationship(website_obj, tracker, "HAS_TRACKER") self.log_graph_message( f"Found tracker {tracker.name} ({tracker.tracker_id}) for website {website_url}" ) diff --git a/flowsint-transforms/tests/README.md b/flowsint-transforms/tests/README.md index de70f0f..ab47323 100644 --- a/flowsint-transforms/tests/README.md +++ b/flowsint-transforms/tests/README.md @@ -1,6 +1,6 @@ -# flowsint-api tests +# flowsint-transforms tests -Run the tests. +Run the tests (make sure you are in the `flowsint-transforms` folder). 
```bash python -m pytest tests/ -v --tb=short diff --git a/flowsint-transforms/tests/insert_command.py b/flowsint-transforms/tests/insert_command.py deleted file mode 100644 index 4df5a03..0000000 --- a/flowsint-transforms/tests/insert_command.py +++ /dev/null @@ -1,28 +0,0 @@ -import sys -import os -import asyncio - -if __name__ == "__main__": - sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) - -from flowsint_types.domain import Domain -from flowsint_types.ip import Ip -from flowsint_transforms.domains.resolve import ResolveTransform - - -async def main(): - # Create test data - domains = [Domain(domain="adaltas.com")] - ips = [Ip(address="12.23.34.45"), Ip(address="56.67.78.89")] - - # Test the transform - transform = ResolveTransform("sketch_123", "scan_123") - - # Test the new KISS postprocess method - transform.postprocess(ips[:1], domains) # Only use first IP to match domains length - - print("Postprocess test completed successfully!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/flowsint-transforms/tests/transforms/domain/resolve.py b/flowsint-transforms/tests/transforms/domain/resolve.py index bccf010..f7f1f3a 100644 --- a/flowsint-transforms/tests/transforms/domain/resolve.py +++ b/flowsint-transforms/tests/transforms/domain/resolve.py @@ -1,4 +1,4 @@ -from flowsint_transforms.domains.resolve import ResolveTransform +from flowsint_transforms.domain.to_ip import ResolveTransform from flowsint_types.domain import Domain from flowsint_types.ip import Ip from typing import List diff --git a/flowsint-transforms/tests/transforms/domain/test_to_asn_preprocess.py b/flowsint-transforms/tests/transforms/domain/test_to_asn_preprocess.py new file mode 100644 index 0000000..7e89511 --- /dev/null +++ b/flowsint-transforms/tests/transforms/domain/test_to_asn_preprocess.py @@ -0,0 +1,98 @@ +import pytest +from flowsint_transforms.domain.to_asn import DomainToAsnTransform +from flowsint_types.domain import Domain + + +@pytest.fixture 
+def transform(): + """Create transform instance for testing.""" + return DomainToAsnTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Domain objects.""" + inputs = [ + Domain(domain="example.com"), + Domain(domain="google.com"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["example.com", "google.com"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"domain": "example.com"}, + {"domain": "google.com"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Domain(domain="example.com"), + "not a domain", # Invalid + Domain(domain="google.com"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Domain(domain="example.com"), # Object + "google.com", # String + {"domain": "github.com"}, # Dict + {"invalid_key": "wrong.com"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + 
assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + assert result[2].domain == "github.com" + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] + + +def test_preprocess_all_invalid(transform): + """Test preprocess when all items are invalid.""" + inputs = [ + "not a domain", + "domain with spaces", + "", + ] + result = transform.preprocess(inputs) + + assert len(result) == 0 diff --git a/flowsint-transforms/tests/transforms/domain/test_to_ip_preprocess.py b/flowsint-transforms/tests/transforms/domain/test_to_ip_preprocess.py new file mode 100644 index 0000000..55b342e --- /dev/null +++ b/flowsint-transforms/tests/transforms/domain/test_to_ip_preprocess.py @@ -0,0 +1,83 @@ +import pytest +from flowsint_transforms.domain.to_ip import ResolveTransform +from flowsint_types.domain import Domain + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return ResolveTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Domain objects.""" + inputs = [ + Domain(domain="example.com"), + Domain(domain="google.com"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["example.com", "google.com"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def 
test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"domain": "example.com"}, + {"domain": "google.com"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Domain(domain="example.com"), + "invalid domain with spaces", # Invalid + Domain(domain="google.com"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Domain(domain="example.com"), # Object + "google.com", # String + {"domain": "github.com"}, # Dict + {"invalid_key": "wrong.com"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Domain) for item in result) + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] diff --git a/flowsint-transforms/tests/transforms/domain/test_to_root_domain_preprocess.py b/flowsint-transforms/tests/transforms/domain/test_to_root_domain_preprocess.py new file mode 100644 index 0000000..a55678e --- /dev/null +++ b/flowsint-transforms/tests/transforms/domain/test_to_root_domain_preprocess.py @@ -0,0 +1,83 @@ +import pytest +from flowsint_transforms.domain.to_root_domain import DomainToRootDomain +from flowsint_types.domain import Domain + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return DomainToRootDomain(sketch_id="test_sketch", scan_id="test_scan") + + +def 
test_preprocess_valid_objects(transform): + """Test preprocess with valid Domain objects.""" + inputs = [ + Domain(domain="subdomain.example.com"), + Domain(domain="www.google.com"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "subdomain.example.com" + assert result[1].domain == "www.google.com" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["subdomain.example.com", "www.google.com"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "subdomain.example.com" + assert result[1].domain == "www.google.com" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"domain": "subdomain.example.com"}, + {"domain": "www.google.com"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Domain) for item in result) + assert result[0].domain == "subdomain.example.com" + assert result[1].domain == "www.google.com" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Domain(domain="example.com"), + "not a domain", # Invalid + Domain(domain="google.com"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert result[0].domain == "example.com" + assert result[1].domain == "google.com" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Domain(domain="subdomain.example.com"), # Object + "www.google.com", # String + {"domain": "api.github.com"}, # Dict + {"invalid_key": "wrong.com"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid 
items + assert all(isinstance(item, Domain) for item in result) + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] diff --git a/flowsint-transforms/tests/transforms/email/test_to_gravatar_preprocess.py b/flowsint-transforms/tests/transforms/email/test_to_gravatar_preprocess.py new file mode 100644 index 0000000..c214f39 --- /dev/null +++ b/flowsint-transforms/tests/transforms/email/test_to_gravatar_preprocess.py @@ -0,0 +1,98 @@ +import pytest +from flowsint_transforms.email.to_gravatar import EmailToGravatarTransform +from flowsint_types.email import Email + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return EmailToGravatarTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Email objects.""" + inputs = [ + Email(email="test@example.com"), + Email(email="user@domain.com"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Email) for item in result) + assert result[0].email == "test@example.com" + assert result[1].email == "user@domain.com" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["test@example.com", "user@domain.com"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Email) for item in result) + assert result[0].email == "test@example.com" + assert result[1].email == "user@domain.com" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"email": "test@example.com"}, + {"email": "user@domain.com"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Email) for item in result) + assert result[0].email == "test@example.com" + assert result[1].email == 
"user@domain.com" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Email(email="test@example.com"), + "not-an-email", # Invalid + Email(email="user@domain.com"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert result[0].email == "test@example.com" + assert result[1].email == "user@domain.com" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Email(email="test@example.com"), # Object + "user@domain.com", # String + {"email": "admin@site.org"}, # Dict + {"invalid_key": "wrong@example.com"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Email) for item in result) + assert result[0].email == "test@example.com" + assert result[1].email == "user@domain.com" + assert result[2].email == "admin@site.org" + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] + + +def test_preprocess_all_invalid(transform): + """Test preprocess when all items are invalid.""" + inputs = [ + "not-an-email", + "missing@", + "invalid", + ] + result = transform.preprocess(inputs) + + assert len(result) == 0 diff --git a/flowsint-transforms/tests/transforms/ip/test_to_asn_preprocess.py b/flowsint-transforms/tests/transforms/ip/test_to_asn_preprocess.py new file mode 100644 index 0000000..2577e4b --- /dev/null +++ b/flowsint-transforms/tests/transforms/ip/test_to_asn_preprocess.py @@ -0,0 +1,83 @@ +import pytest +from flowsint_transforms.ip.to_asn import IpToAsnTransform +from flowsint_types.ip import Ip + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return IpToAsnTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def 
test_preprocess_valid_objects(transform): + """Test preprocess with valid Ip objects.""" + inputs = [ + Ip(address="8.8.8.8"), + Ip(address="1.1.1.1"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Ip) for item in result) + assert result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["8.8.8.8", "1.1.1.1"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Ip) for item in result) + assert result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"address": "8.8.8.8"}, + {"address": "1.1.1.1"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Ip) for item in result) + assert result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Ip(address="8.8.8.8"), + "not-an-ip", # Invalid + Ip(address="1.1.1.1"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Ip(address="8.8.8.8"), # Object + "1.1.1.1", # String + {"address": "192.168.1.1"}, # Dict + {"invalid_key": "10.0.0.1"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Ip) for item in result) + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + 
assert result == [] diff --git a/flowsint-transforms/tests/transforms/ip/test_to_infos_preprocess.py b/flowsint-transforms/tests/transforms/ip/test_to_infos_preprocess.py new file mode 100644 index 0000000..db43ad6 --- /dev/null +++ b/flowsint-transforms/tests/transforms/ip/test_to_infos_preprocess.py @@ -0,0 +1,98 @@ +import pytest +from flowsint_transforms.ip.to_infos import IpToInfosTransform +from flowsint_types.ip import Ip + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return IpToInfosTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Ip objects.""" + inputs = [ + Ip(address="8.8.8.8"), + Ip(address="1.1.1.1"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Ip) for item in result) + assert result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["8.8.8.8", "1.1.1.1"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Ip) for item in result) + assert result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"address": "8.8.8.8"}, + {"address": "1.1.1.1"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Ip) for item in result) + assert result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Ip(address="8.8.8.8"), + "999.999.999.999", # Invalid + Ip(address="1.1.1.1"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert 
result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Ip(address="8.8.8.8"), # Object + "1.1.1.1", # String + {"address": "192.168.1.1"}, # Dict + {"invalid_key": "10.0.0.1"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Ip) for item in result) + assert result[0].address == "8.8.8.8" + assert result[1].address == "1.1.1.1" + assert result[2].address == "192.168.1.1" + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] + + +def test_preprocess_all_invalid(transform): + """Test preprocess when all items are invalid.""" + inputs = [ + "not-an-ip", + "999.999.999.999", + "invalid", + ] + result = transform.preprocess(inputs) + + assert len(result) == 0 diff --git a/flowsint-transforms/tests/transforms/organization/test_to_asn_preprocess.py b/flowsint-transforms/tests/transforms/organization/test_to_asn_preprocess.py new file mode 100644 index 0000000..6b103af --- /dev/null +++ b/flowsint-transforms/tests/transforms/organization/test_to_asn_preprocess.py @@ -0,0 +1,87 @@ +import pytest +from flowsint_transforms.organization.to_asn import OrgToAsnTransform +from flowsint_types.organization import Organization + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return OrgToAsnTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Organization objects.""" + inputs = [ + Organization(name="Acme Corp"), + Organization(name="Google LLC"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Organization) for item in result) + assert result[0].name == "Acme Corp" + assert 
result[1].name == "Google LLC" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["Acme Corp", "Google LLC"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Organization) for item in result) + assert result[0].name == "Acme Corp" + assert result[1].name == "Google LLC" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"name": "Acme Corp"}, + {"name": "Google LLC"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Organization) for item in result) + assert result[0].name == "Acme Corp" + assert result[1].name == "Google LLC" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Organization(name="Acme Corp"), # Object + "Google LLC", # String + {"name": "Microsoft Corporation"}, # Dict + {"invalid_key": "Apple Inc"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Organization) for item in result) + assert result[0].name == "Acme Corp" + assert result[1].name == "Google LLC" + assert result[2].name == "Microsoft Corporation" + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] + + +def test_preprocess_with_additional_fields(transform): + """Test preprocess with dicts containing additional fields.""" + inputs = [ + {"name": "Acme Corp", "nom_complet": "Acme Corporation"}, + {"name": "Google LLC", "siren": "123456789"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Organization) for item in result) + assert result[0].name == "Acme Corp" + assert result[0].nom_complet == "Acme Corporation" + assert result[1].name == "Google 
LLC" + assert result[1].siren == "123456789" diff --git a/flowsint-transforms/tests/transforms/phone/test_to_infos_preprocess.py b/flowsint-transforms/tests/transforms/phone/test_to_infos_preprocess.py new file mode 100644 index 0000000..43b085d --- /dev/null +++ b/flowsint-transforms/tests/transforms/phone/test_to_infos_preprocess.py @@ -0,0 +1,95 @@ +import pytest +from flowsint_transforms.phone.to_infos import IgnorantTransform +from flowsint_types.phone import Phone + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return IgnorantTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Phone objects.""" + inputs = [ + Phone(number="+33612345678"), + Phone(number="+14155552671"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Phone) for item in result) + assert result[0].number == "+33612345678" + assert result[1].number == "+14155552671" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["+33612345678", "+14155552671"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Phone) for item in result) + assert result[0].number == "+33612345678" + assert result[1].number == "+14155552671" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"number": "+33612345678"}, + {"number": "+14155552671"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Phone) for item in result) + assert result[0].number == "+33612345678" + assert result[1].number == "+14155552671" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Phone(number="+33612345678"), + "123", # Invalid (too short) + Phone(number="+14155552671"), + ] + 
result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert result[0].number == "+33612345678" + assert result[1].number == "+14155552671" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Phone(number="+33612345678"), # Object + "+14155552671", # String + {"number": "+447911123456"}, # Dict + {"invalid_key": "+15555555555"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Phone) for item in result) + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] + + +def test_preprocess_all_invalid(transform): + """Test preprocess when all items are invalid.""" + inputs = [ + "123", + "not-a-phone", + "", + ] + result = transform.preprocess(inputs) + + assert len(result) == 0 diff --git a/flowsint-transforms/tests/transforms/social/test_to_maigret_preprocess.py b/flowsint-transforms/tests/transforms/social/test_to_maigret_preprocess.py new file mode 100644 index 0000000..8f349d6 --- /dev/null +++ b/flowsint-transforms/tests/transforms/social/test_to_maigret_preprocess.py @@ -0,0 +1,111 @@ +import pytest +from flowsint_transforms.social.to_maigret import MaigretTransform +from flowsint_types.username import Username + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return MaigretTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Username objects.""" + inputs = [ + Username(value="john_doe"), + Username(value="user123"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Username) for item in result) + assert result[0].value == "john_doe" + assert result[1].value == "user123" + + +def 
test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["john_doe", "user123"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Username) for item in result) + assert result[0].value == "john_doe" + assert result[1].value == "user123" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"value": "john_doe"}, + {"value": "user123"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Username) for item in result) + assert result[0].value == "john_doe" + assert result[1].value == "user123" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Username(value="john_doe"), + "ab", # Invalid (too short, must be 3-30 chars) + Username(value="user123"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert result[0].value == "john_doe" + assert result[1].value == "user123" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Username(value="john_doe"), # Object + "user123", # String + {"value": "alice_2023"}, # Dict + {"invalid_key": "bob_smith"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Username) for item in result) + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] + + +def test_preprocess_all_invalid(transform): + """Test preprocess when all items are invalid.""" + inputs = [ + "ab", # Too short + "user with spaces", # Invalid characters + "", # Empty + ] + result = transform.preprocess(inputs) + + assert len(result) == 0 + + +def 
test_preprocess_with_platform(transform): + """Test preprocess with username including platform.""" + inputs = [ + {"value": "john_doe", "platform": "twitter"}, + {"value": "user123", "platform": "github"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Username) for item in result) + assert result[0].value == "john_doe" + assert result[0].platform == "twitter" + assert result[1].value == "user123" + assert result[1].platform == "github" diff --git a/flowsint-transforms/tests/transforms/social/test_to_sherlock_preprocess.py b/flowsint-transforms/tests/transforms/social/test_to_sherlock_preprocess.py new file mode 100644 index 0000000..a63f685 --- /dev/null +++ b/flowsint-transforms/tests/transforms/social/test_to_sherlock_preprocess.py @@ -0,0 +1,95 @@ +import pytest +from flowsint_transforms.social.to_sherlock import SherlockTransform +from flowsint_types.username import Username + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return SherlockTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Username objects.""" + inputs = [ + Username(value="john_doe"), + Username(value="user123"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Username) for item in result) + assert result[0].value == "john_doe" + assert result[1].value == "user123" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["john_doe", "user123"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Username) for item in result) + assert result[0].value == "john_doe" + assert result[1].value == "user123" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"value": "john_doe"}, + {"value": "user123"}, 
+ ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Username) for item in result) + assert result[0].value == "john_doe" + assert result[1].value == "user123" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Username(value="john_doe"), + "ab", # Invalid (too short, must be 3-30 chars) + Username(value="user123"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert result[0].value == "john_doe" + assert result[1].value == "user123" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Username(value="john_doe"), # Object + "user123", # String + {"value": "alice_2023"}, # Dict + {"invalid_key": "bob_smith"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Username) for item in result) + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] + + +def test_preprocess_all_invalid(transform): + """Test preprocess when all items are invalid.""" + inputs = [ + "ab", # Too short (min 3 chars) + "user with spaces", # Invalid characters + "a" * 81, # Too long (max 80 chars) + ] + result = transform.preprocess(inputs) + + assert len(result) == 0 diff --git a/flowsint-transforms/tests/transforms/website/test_to_domain_preprocess.py b/flowsint-transforms/tests/transforms/website/test_to_domain_preprocess.py new file mode 100644 index 0000000..1dfbf07 --- /dev/null +++ b/flowsint-transforms/tests/transforms/website/test_to_domain_preprocess.py @@ -0,0 +1,95 @@ +import pytest +from flowsint_transforms.website.to_domain import WebsiteToDomainTransform +from flowsint_types.website import Website + + +@pytest.fixture +def transform(): + """Create 
transform instance for testing.""" + return WebsiteToDomainTransform(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Website objects.""" + inputs = [ + Website(url="https://example.com"), + Website(url="https://google.com"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Website) for item in result) + assert str(result[0].url) == "https://example.com/" + assert str(result[1].url) == "https://google.com/" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["https://example.com", "https://google.com"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Website) for item in result) + assert str(result[0].url) == "https://example.com/" + assert str(result[1].url) == "https://google.com/" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"url": "https://example.com"}, + {"url": "https://google.com"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Website) for item in result) + assert str(result[0].url) == "https://example.com/" + assert str(result[1].url) == "https://google.com/" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Website(url="https://example.com"), + "not-a-url", # Invalid + Website(url="https://google.com"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert str(result[0].url) == "https://example.com/" + assert str(result[1].url) == "https://google.com/" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Website(url="https://example.com"), # Object + "https://google.com", # String + {"url": 
"https://github.com"}, # Dict + {"invalid_key": "https://wrong.com"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Website) for item in result) + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] + + +def test_preprocess_all_invalid(transform): + """Test preprocess when all items are invalid.""" + inputs = [ + "not-a-url", + "ht tp://broken.com", + "", + ] + result = transform.preprocess(inputs) + + assert len(result) == 0 diff --git a/flowsint-transforms/tests/transforms/website/test_to_text_preprocess.py b/flowsint-transforms/tests/transforms/website/test_to_text_preprocess.py new file mode 100644 index 0000000..1142b7b --- /dev/null +++ b/flowsint-transforms/tests/transforms/website/test_to_text_preprocess.py @@ -0,0 +1,83 @@ +import pytest +from flowsint_transforms.website.to_text import WebsiteToText +from flowsint_types.website import Website + + +@pytest.fixture +def transform(): + """Create transform instance for testing.""" + return WebsiteToText(sketch_id="test_sketch", scan_id="test_scan") + + +def test_preprocess_valid_objects(transform): + """Test preprocess with valid Website objects.""" + inputs = [ + Website(url="https://example.com"), + Website(url="https://google.com"), + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Website) for item in result) + assert str(result[0].url) == "https://example.com/" + assert str(result[1].url) == "https://google.com/" + + +def test_preprocess_strings(transform): + """Test preprocess with string inputs (converted via primary field).""" + inputs = ["https://example.com", "https://google.com"] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Website) for item in result) + assert str(result[0].url) == 
"https://example.com/" + assert str(result[1].url) == "https://google.com/" + + +def test_preprocess_dicts(transform): + """Test preprocess with dict inputs.""" + inputs = [ + {"url": "https://example.com"}, + {"url": "https://google.com"}, + ] + result = transform.preprocess(inputs) + + assert len(result) == 2 + assert all(isinstance(item, Website) for item in result) + assert str(result[0].url) == "https://example.com/" + assert str(result[1].url) == "https://google.com/" + + +def test_preprocess_invalid_filtered(transform): + """Test that invalid items are filtered out.""" + inputs = [ + Website(url="https://example.com"), + "not-a-url", # Invalid + Website(url="https://google.com"), + ] + result = transform.preprocess(inputs) + + # Only valid items should remain + assert len(result) == 2 + assert str(result[0].url) == "https://example.com/" + assert str(result[1].url) == "https://google.com/" + + +def test_preprocess_mixed_formats(transform): + """Test preprocess with mixed input formats.""" + inputs = [ + Website(url="https://example.com"), # Object + "https://google.com", # String + {"url": "https://github.com"}, # Dict + {"invalid_key": "https://wrong.com"}, # Invalid dict (wrong key) + ] + result = transform.preprocess(inputs) + + assert len(result) == 3 # Should have 3 valid items + assert all(isinstance(item, Website) for item in result) + + +def test_preprocess_empty_list(transform): + """Test preprocess with empty input.""" + result = transform.preprocess([]) + assert result == [] diff --git a/flowsint-types/src/flowsint_types/address.py b/flowsint-types/src/flowsint_types/address.py index 8c78534..dd79ab5 100644 --- a/flowsint-types/src/flowsint_types/address.py +++ b/flowsint-types/src/flowsint_types/address.py @@ -1,8 +1,9 @@ -from pydantic import BaseModel, Field -from typing import Optional +from pydantic import Field, model_validator +from typing import Optional, Self +from .flowsint_base import FlowsintType -class Location(BaseModel): +class 
Location(FlowsintType): """Represents a physical address with geographical coordinates.""" address: str = Field(..., description="Street address", title="Street Address") @@ -15,3 +16,8 @@ class Location(BaseModel): longitude: Optional[float] = Field( None, description="Longitude coordinate of the address", title="Longitude" ) + + @model_validator(mode="after") + def compute_label(self) -> Self: + self.label = f"{self.address} {self.city}, {self.country}" + return self diff --git a/flowsint-types/src/flowsint_types/affiliation.py b/flowsint-types/src/flowsint_types/affiliation.py index 9b594f1..f11d831 100644 --- a/flowsint-types/src/flowsint_types/affiliation.py +++ b/flowsint-types/src/flowsint_types/affiliation.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Affiliation(BaseModel): +class Affiliation(FlowsintType): """Represents an organizational affiliation or employment relationship.""" organization: str = Field( @@ -53,3 +55,11 @@ class Affiliation(BaseModel): description="Hierarchical level within organization", title="Hierarchy Level", ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + if self.role: + self.label = f"{self.role} at {self.organization}" + else: + self.label = self.organization + return self diff --git a/flowsint-types/src/flowsint_types/alias.py b/flowsint-types/src/flowsint_types/alias.py index 2e76612..274fb50 100644 --- a/flowsint-types/src/flowsint_types/alias.py +++ b/flowsint-types/src/flowsint_types/alias.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Alias(BaseModel): +class Alias(FlowsintType): """Represents an alias or alternative name used by an 
entity.""" alias: str = Field(..., description="Alias or alternative name", title="Alias") @@ -43,3 +45,8 @@ class Alias(BaseModel): region: Optional[str] = Field( None, description="Geographic region where alias is used", title="Region" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = self.alias + return self diff --git a/flowsint-types/src/flowsint_types/asn.py b/flowsint-types/src/flowsint_types/asn.py index 5977fbd..e4334bc 100644 --- a/flowsint-types/src/flowsint_types/asn.py +++ b/flowsint-types/src/flowsint_types/asn.py @@ -1,9 +1,10 @@ -from typing import List, Optional, Union -from pydantic import BaseModel, Field, field_validator, model_validator +from typing import List, Optional, Union, Self +from pydantic import Field, field_validator, model_validator import re +from .flowsint_base import FlowsintType -class ASN(BaseModel): +class ASN(FlowsintType): """Represents an Autonomous System Number with associated network information.""" number: int = Field( @@ -59,12 +60,21 @@ class ASN(BaseModel): return v @model_validator(mode='after') - def populate_asn_str(self) -> 'ASN': + def populate_asn_str(self) -> Self: """Automatically populate asn_str from number if not provided.""" # Always set asn_str based on number to ensure consistency self.asn_str = f"AS{self.number}" return self + @model_validator(mode='after') + def compute_label(self) -> Self: + # Use name and ASN string if available + if self.name: + self.label = f"{self.asn_str} - {self.name}" + else: + self.label = self.asn_str + return self + # Import CIDR here to avoid circular import from .cidr import CIDR diff --git a/flowsint-types/src/flowsint_types/bank_account.py b/flowsint-types/src/flowsint_types/bank_account.py index bbafc74..3ef8864 100644 --- a/flowsint-types/src/flowsint_types/bank_account.py +++ b/flowsint-types/src/flowsint_types/bank_account.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic 
import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class BankAccount(BaseModel): +class BankAccount(FlowsintType): """Represents a bank account with financial and security information.""" account_number: str = Field( @@ -61,3 +63,12 @@ class BankAccount(BaseModel): breach_source: Optional[str] = Field( None, description="Source of breach if compromised", title="Breach Source" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + parts = [] + if self.bank_name: + parts.append(self.bank_name) + parts.append(f"****{self.account_number[-4:]}" if len(self.account_number) > 4 else self.account_number) + self.label = " - ".join(parts) + return self diff --git a/flowsint-types/src/flowsint_types/breach.py b/flowsint-types/src/flowsint_types/breach.py index e3b5b05..784619f 100644 --- a/flowsint-types/src/flowsint_types/breach.py +++ b/flowsint-types/src/flowsint_types/breach.py @@ -1,8 +1,9 @@ -from typing import List, Optional, Dict -from pydantic import BaseModel, Field +from typing import List, Optional, Dict, Self +from pydantic import Field, model_validator +from .flowsint_base import FlowsintType -class Breach(BaseModel): +class Breach(FlowsintType): """Represents a data breach incident with affected accounts and details.""" name: str = Field( @@ -66,3 +67,12 @@ class Breach(BaseModel): description="Full breach data as returned by the API", title="Full Breach Data", ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Use title if available, otherwise name + if self.title: + self.label = self.title + else: + self.label = self.name + return self diff --git a/flowsint-types/src/flowsint_types/cidr.py b/flowsint-types/src/flowsint_types/cidr.py index 5cbc24b..ed414c4 100644 --- a/flowsint-types/src/flowsint_types/cidr.py +++ b/flowsint-types/src/flowsint_types/cidr.py @@ -1,9 +1,16 @@ -from pydantic import BaseModel, IPvAnyNetwork, Field +from pydantic import 
IPvAnyNetwork, Field, model_validator +from typing import Self +from .flowsint_base import FlowsintType -class CIDR(BaseModel): +class CIDR(FlowsintType): """Represents a CIDR (Classless Inter-Domain Routing) network block.""" network: IPvAnyNetwork = Field( ..., description="CIDR block (e.g., 8.8.8.0/24)", title="Network Block" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = str(self.network) + return self diff --git a/flowsint-types/src/flowsint_types/credential.py b/flowsint-types/src/flowsint_types/credential.py index a7c75e9..93a2d9b 100644 --- a/flowsint-types/src/flowsint_types/credential.py +++ b/flowsint-types/src/flowsint_types/credential.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Credential(BaseModel): +class Credential(FlowsintType): """Represents user credentials with compromise and usage information.""" username: str = Field(..., description="Username or identifier", title="Username") @@ -62,3 +64,11 @@ class Credential(BaseModel): source: Optional[str] = Field( None, description="Source of credential information", title="Source" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + if self.service: + self.label = f"{self.username}@{self.service}" + else: + self.label = self.username + return self diff --git a/flowsint-types/src/flowsint_types/credit_card.py b/flowsint-types/src/flowsint_types/credit_card.py index f6ef1fc..17d901e 100644 --- a/flowsint-types/src/flowsint_types/credit_card.py +++ b/flowsint-types/src/flowsint_types/credit_card.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class CreditCard(BaseModel): +class 
CreditCard(FlowsintType): """Represents a credit card with financial details and security status.""" card_number: str = Field(..., description="Credit card number", title="Card Number") @@ -56,3 +58,12 @@ class CreditCard(BaseModel): last_used: Optional[str] = Field( None, description="Last time card was used", title="Last Used" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + parts = [] + if self.card_type: + parts.append(self.card_type) + parts.append(f"****{self.card_number[-4:]}" if len(self.card_number) > 4 else self.card_number) + self.label = " ".join(parts) + return self diff --git a/flowsint-types/src/flowsint_types/device.py b/flowsint-types/src/flowsint_types/device.py index 829c7c1..6f8995f 100644 --- a/flowsint-types/src/flowsint_types/device.py +++ b/flowsint-types/src/flowsint_types/device.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Device(BaseModel): +class Device(FlowsintType): """Represents a device with hardware, software, and network information.""" device_id: str = Field( @@ -63,3 +65,15 @@ class Device(BaseModel): source: Optional[str] = Field( None, description="Source of device information", title="Source" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + parts = [] + if self.manufacturer: + parts.append(self.manufacturer) + if self.model: + parts.append(self.model) + if not parts: + parts.append(self.device_id) + self.label = " ".join(parts) + return self diff --git a/flowsint-types/src/flowsint_types/dns_record.py b/flowsint-types/src/flowsint_types/dns_record.py index aa20a02..9cca5d1 100644 --- a/flowsint-types/src/flowsint_types/dns_record.py +++ b/flowsint-types/src/flowsint_types/dns_record.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import 
Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class DNSRecord(BaseModel): +class DNSRecord(FlowsintType): """Represents a DNS record with type, value, and security information.""" record_type: str = Field( @@ -48,3 +50,8 @@ class DNSRecord(BaseModel): threat_level: Optional[str] = Field( None, description="Threat level assessment", title="Threat Level" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = f"{self.name} ({self.record_type})" + return self diff --git a/flowsint-types/src/flowsint_types/document.py b/flowsint-types/src/flowsint_types/document.py index 7be38de..2c6f09c 100644 --- a/flowsint-types/src/flowsint_types/document.py +++ b/flowsint-types/src/flowsint_types/document.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Document(BaseModel): +class Document(FlowsintType): """Represents a document with metadata, security, and content information.""" title: str = Field(..., description="Document title", title="Title") @@ -56,3 +58,8 @@ class Document(BaseModel): source: Optional[str] = Field( None, description="Source of document information", title="Source" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = self.title + return self diff --git a/flowsint-types/src/flowsint_types/domain.py b/flowsint-types/src/flowsint_types/domain.py index 00db94f..a46037b 100644 --- a/flowsint-types/src/flowsint_types/domain.py +++ b/flowsint-types/src/flowsint_types/domain.py @@ -1,46 +1,44 @@ -from typing import Optional, Union, Any -from pydantic import BaseModel, Field, field_validator, model_validator +from typing import Optional, Self +from pydantic import Field, field_validator, model_validator from urllib.parse import urlparse import re +from 
.flowsint_base import FlowsintType -class Domain(BaseModel): +class Domain(FlowsintType): """Represents a domain name and its properties.""" - domain: str = Field(..., description="Domain name", title="Domain Name") + domain: str = Field( + ..., + description="Domain name", + title="Domain name", + json_schema_extra={"primary": True}, + ) root: Optional[bool] = Field( True, description="Is root or not", title="Is Root Domain" ) - @model_validator(mode='before') - @classmethod - def convert_string_to_dict(cls, data: Any) -> Any: - """Allow creating Domain from a string directly.""" - if isinstance(data, str): - return {'domain': data} - return data - - @field_validator('domain') + @field_validator("domain") @classmethod def validate_domain(cls, v: str) -> str: - """Validate that the domain is valid.""" try: - # Parse URL to extract hostname parsed = urlparse(v if "://" in v else "http://" + v) hostname = parsed.hostname or v - - # Check that domain has at least one dot if not hostname or "." 
not in hostname: - raise ValueError(f"Invalid domain format: {v}") - - # Validate domain format with regex + raise ValueError if not re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", hostname): - raise ValueError(f"Invalid domain format: {v}") - - # Return the cleaned hostname (without protocol) + raise ValueError return hostname - except Exception as e: - raise ValueError(f"Invalid domain: {v}") from e + except Exception: + raise ValueError(f"Invalid domain: {v}") + @model_validator(mode="after") + def check_root(self) -> Self: + parts = self.domain.split(".") + self.root = len(parts) == 2 + return self -Domain.model_rebuild() + @model_validator(mode="after") + def compute_label(self) -> Self: + self.label = self.domain + return self diff --git a/flowsint-types/src/flowsint_types/email.py b/flowsint-types/src/flowsint_types/email.py index f540557..3630460 100644 --- a/flowsint-types/src/flowsint_types/email.py +++ b/flowsint-types/src/flowsint_types/email.py @@ -1,16 +1,14 @@ -from pydantic import BaseModel, Field, EmailStr, model_validator -from typing import Any +from pydantic import Field, EmailStr, model_validator +from typing import Any, Self +from .flowsint_base import FlowsintType -class Email(BaseModel): +class Email(FlowsintType): """Represents an email address.""" email: EmailStr = Field(..., description="Email address", title="Email Address") - @model_validator(mode='before') - @classmethod - def convert_string_to_dict(cls, data: Any) -> Any: - """Allow creating Email from a string directly.""" - if isinstance(data, str): - return {'email': data} - return data + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = self.email + return self diff --git a/flowsint-types/src/flowsint_types/file.py b/flowsint-types/src/flowsint_types/file.py index 608ec3a..fea549b 100644 --- a/flowsint-types/src/flowsint_types/file.py +++ b/flowsint-types/src/flowsint_types/file.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing 
import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class File(BaseModel): +class File(FlowsintType): """Represents a file with metadata, type information, and security assessment.""" filename: str = Field(..., description="File name", title="Filename") @@ -61,3 +63,8 @@ class File(BaseModel): threat_level: Optional[str] = Field( None, description="Threat level assessment", title="Threat Level" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = self.filename + return self diff --git a/flowsint-types/src/flowsint_types/flowsint_base.py b/flowsint-types/src/flowsint_types/flowsint_base.py new file mode 100644 index 0000000..9cc9312 --- /dev/null +++ b/flowsint-types/src/flowsint_types/flowsint_base.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel, Field +from typing import Optional + + +class FlowsintType(BaseModel): + """Base class for all Flowsint entity types with label support. + Label is optional but computed at definition time. 
+ """ + label: Optional[str] = Field( + None, description="UI-readable label for this entity, the one used on the graph.", title="Label" + ) diff --git a/flowsint-types/src/flowsint_types/gravatar.py b/flowsint-types/src/flowsint_types/gravatar.py index cc1cb05..cf1c555 100644 --- a/flowsint-types/src/flowsint_types/gravatar.py +++ b/flowsint-types/src/flowsint_types/gravatar.py @@ -1,8 +1,9 @@ -from pydantic import BaseModel, Field, HttpUrl -from typing import Optional, List +from pydantic import Field, HttpUrl, model_validator +from typing import Optional, List, Self +from .flowsint_base import FlowsintType -class Gravatar(BaseModel): +class Gravatar(FlowsintType): """Represents a Gravatar profile with image and user information.""" src: HttpUrl = Field(..., description="Gravatar image URL", title="Image URL") @@ -42,3 +43,12 @@ class Gravatar(BaseModel): large_url: Optional[HttpUrl] = Field( None, description="Larger version of the image", title="Large Image URL" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Use display name if available, otherwise hash + if self.display_name: + self.label = f"{self.display_name}" + else: + self.label = f"{self.hash}" + return self diff --git a/flowsint-types/src/flowsint_types/individual.py b/flowsint-types/src/flowsint_types/individual.py index dc67047..4fcdb8c 100644 --- a/flowsint-types/src/flowsint_types/individual.py +++ b/flowsint-types/src/flowsint_types/individual.py @@ -1,12 +1,13 @@ -from pydantic import BaseModel, Field, field_validator -from typing import Optional, Literal, List, Union +from pydantic import Field, field_validator, model_validator +from typing import Optional, Literal, List, Union, Self from .address import Location from .email import Email from .phone import Phone from .ip import Ip +from .flowsint_base import FlowsintType -class Individual(BaseModel): +class Individual(FlowsintType): """Represents an individual person with comprehensive personal information.""" # Basic 
Information @@ -16,8 +17,8 @@ class Individual(BaseModel): last_name: str = Field( ..., description="Last name of the individual", title="Last Name" ) - full_name: str = Field( - ..., description="Full name of the individual", title="Full Name" + full_name: Optional[str] = Field( + None, description="Full name of the individual", title="Full Name" ) middle_name: Optional[str] = Field( None, description="Middle name or initial", title="Middle Name" @@ -321,7 +322,6 @@ class Individual(BaseModel): for email in v: if not email: continue - try: # If already an Email object, keep it if isinstance(email, Email): @@ -344,12 +344,10 @@ class Individual(BaseModel): """Validate phone numbers in the list and convert to Phone objects.""" if v is None: return None - validated_phones = [] for phone in v: if not phone: continue - try: # If already a Phone object, keep it if isinstance(phone, Phone): @@ -363,7 +361,6 @@ class Individual(BaseModel): except Exception: # Skip invalid phone numbers continue - return validated_phones if validated_phones else None @field_validator('ip_addresses', mode='before') @@ -372,12 +369,10 @@ class Individual(BaseModel): """Validate that all IP addresses in the list are valid and convert to Ip objects.""" if v is None: return None - validated_ips = [] for ip in v: if not ip: continue - try: # If already an Ip object, keep it if isinstance(ip, Ip): @@ -393,3 +388,16 @@ class Individual(BaseModel): continue return validated_ips if validated_ips else None + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Use full_name if available, otherwise concatenate first and last name + if self.full_name: + self.label = self.full_name + elif self.first_name and self.last_name: + self.label = f"{self.first_name} {self.last_name}" + elif self.first_name: + self.label = self.first_name + elif self.last_name: + self.label = self.last_name + return self diff --git a/flowsint-types/src/flowsint_types/ip.py 
b/flowsint-types/src/flowsint_types/ip.py index a329df4..d4fdb68 100644 --- a/flowsint-types/src/flowsint_types/ip.py +++ b/flowsint-types/src/flowsint_types/ip.py @@ -1,12 +1,18 @@ -from pydantic import BaseModel, Field, field_validator, model_validator -from typing import Optional, Any +from pydantic import Field, field_validator, model_validator +from typing import Optional, Any, Self import ipaddress +from .flowsint_base import FlowsintType -class Ip(BaseModel): +class Ip(FlowsintType): """Represents an IP address with geolocation and ISP information.""" - address: str = Field(..., description="IP address", title="IP Address") + address: str = Field( + ..., + description="IP address", + title="IP Address", + json_schema_extra={"primary": True}, + ) latitude: Optional[float] = Field( None, description="Latitude coordinate of the IP location", title="Latitude" ) @@ -23,15 +29,7 @@ class Ip(BaseModel): None, description="Internet Service Provider", title="ISP" ) - @model_validator(mode='before') - @classmethod - def convert_string_to_dict(cls, data: Any) -> Any: - """Allow creating Ip from a string directly.""" - if isinstance(data, str): - return {'address': data} - return data - - @field_validator('address') + @field_validator("address") @classmethod def validate_ip_address(cls, v: str) -> str: """Validate that the address is a valid IP address.""" @@ -40,3 +38,8 @@ class Ip(BaseModel): return v except ValueError: raise ValueError(f"Invalid IP address: {v}") + + @model_validator(mode="after") + def compute_label(self) -> Self: + self.label = self.address + return self diff --git a/flowsint-types/src/flowsint_types/leak.py b/flowsint-types/src/flowsint_types/leak.py index ec39230..b8cd252 100644 --- a/flowsint-types/src/flowsint_types/leak.py +++ b/flowsint-types/src/flowsint_types/leak.py @@ -1,8 +1,10 @@ -from typing import List, Dict, Optional -from pydantic import BaseModel, Field +from typing import List, Dict, Optional, Self +from pydantic import Field, 
model_validator + +from .flowsint_base import FlowsintType -class Leak(BaseModel): +class Leak(FlowsintType): """Represents a data leak or breach with associated data.""" name: str = Field( @@ -11,3 +13,8 @@ class Leak(BaseModel): leak: Optional[List[Dict]] = Field( None, description="List of data leaks found", title="Leak Data" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = self.name + return self diff --git a/flowsint-types/src/flowsint_types/malware.py b/flowsint-types/src/flowsint_types/malware.py index b6c3c01..cacfab3 100644 --- a/flowsint-types/src/flowsint_types/malware.py +++ b/flowsint-types/src/flowsint_types/malware.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Malware(BaseModel): +class Malware(FlowsintType): """Represents malware with family, capabilities, and threat intelligence.""" name: str = Field(..., description="Malware name or identifier", title="Name") @@ -58,3 +60,11 @@ class Malware(BaseModel): sample_hashes: Optional[List[str]] = Field( None, description="Sample file hashes", title="Sample Hashes" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + if self.family: + self.label = f"{self.name} ({self.family})" + else: + self.label = self.name + return self diff --git a/flowsint-types/src/flowsint_types/message.py b/flowsint-types/src/flowsint_types/message.py index a16875e..16ac231 100644 --- a/flowsint-types/src/flowsint_types/message.py +++ b/flowsint-types/src/flowsint_types/message.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Message(BaseModel): +class Message(FlowsintType): """Represents a message with content, 
metadata, and security analysis.""" message_id: str = Field( @@ -60,3 +62,13 @@ class Message(BaseModel): threat_level: Optional[str] = Field( None, description="Threat level assessment", title="Threat Level" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + if self.subject: + self.label = self.subject + else: + # Truncate content to first 50 characters + content_preview = self.content[:50] + "..." if len(self.content) > 50 else self.content + self.label = content_preview + return self diff --git a/flowsint-types/src/flowsint_types/organization.py b/flowsint-types/src/flowsint_types/organization.py index 3f84dc9..e1b538b 100644 --- a/flowsint-types/src/flowsint_types/organization.py +++ b/flowsint-types/src/flowsint_types/organization.py @@ -1,10 +1,11 @@ -from pydantic import BaseModel, Field, model_validator -from typing import Any, Optional, List +from pydantic import Field, model_validator +from typing import Any, Optional, List, Self from .individual import Individual from .address import Location +from .flowsint_base import FlowsintType -class Organization(BaseModel): +class Organization(FlowsintType): """Represents an organization with detailed business and administrative information.""" # Basic information @@ -369,3 +370,14 @@ class Organization(BaseModel): complements_type_siae: Optional[Any] = Field( None, description="Complements SIAE type", title="SIAE Type" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Use the full name if available, otherwise use name + if self.nom_complet: + self.label = str(self.nom_complet) + elif self.nom_raison_sociale: + self.label = str(self.nom_raison_sociale) + elif self.name: + self.label = str(self.name) + return self diff --git a/flowsint-types/src/flowsint_types/phone.py b/flowsint-types/src/flowsint_types/phone.py index 6e503be..3b3aaef 100644 --- a/flowsint-types/src/flowsint_types/phone.py +++ b/flowsint-types/src/flowsint_types/phone.py @@ -1,10 +1,11 @@ -from 
pydantic import BaseModel, Field, field_validator, model_validator -from typing import Optional, Any +from pydantic import Field, field_validator, model_validator +from typing import Optional, Any, Self import phonenumbers from phonenumbers import NumberParseException +from .flowsint_base import FlowsintType -class Phone(BaseModel): +class Phone(FlowsintType): """Represents a phone number with country and carrier information.""" number: str = Field(..., description="Phone number", title="Phone Number") @@ -54,3 +55,8 @@ class Phone(BaseModel): # If all attempts fail, raise an error raise ValueError(f"Invalid phone number: {v}. Must be in international format (+...) or a valid format for common regions.") + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = self.number + return self diff --git a/flowsint-types/src/flowsint_types/phrase.py b/flowsint-types/src/flowsint_types/phrase.py index 678832c..d9b4dd5 100644 --- a/flowsint-types/src/flowsint_types/phrase.py +++ b/flowsint-types/src/flowsint_types/phrase.py @@ -1,10 +1,19 @@ -from pydantic import BaseModel, Field -from typing import Any +from pydantic import Field, model_validator +from typing import Any, Self + +from .flowsint_base import FlowsintType -class Phrase(BaseModel): +class Phrase(FlowsintType): """Represents a phrase or text content.""" text: Any = Field( ..., description="The content of the phrase.", title="Phrase text value." ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + text_str = str(self.text) + # Truncate to 100 characters for display + self.label = text_str[:100] + "..." 
if len(text_str) > 100 else text_str + return self diff --git a/flowsint-types/src/flowsint_types/port.py b/flowsint-types/src/flowsint_types/port.py index 384bede..aec1609 100644 --- a/flowsint-types/src/flowsint_types/port.py +++ b/flowsint-types/src/flowsint_types/port.py @@ -1,8 +1,9 @@ -from pydantic import BaseModel, Field, field_validator -from typing import Optional +from pydantic import Field, field_validator, model_validator +from typing import Optional, Self +from .flowsint_base import FlowsintType -class Port(BaseModel): +class Port(FlowsintType): """Represents an open network port related to an IP address.""" number: int = Field(..., description="Port number", title="Port Number") @@ -26,3 +27,14 @@ class Port(BaseModel): if not (0 <= v <= 65535): raise ValueError(f"Port number must be between 0 and 65535, got {v}") return v + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Include service and protocol if available + parts = [str(self.number)] + if self.service: + parts.append(self.service) + if self.protocol: + parts.append(f"({self.protocol})") + self.label = " ".join(parts) + return self diff --git a/flowsint-types/src/flowsint_types/reputation_score.py b/flowsint-types/src/flowsint_types/reputation_score.py index c08693e..93a2e6f 100644 --- a/flowsint-types/src/flowsint_types/reputation_score.py +++ b/flowsint-types/src/flowsint_types/reputation_score.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class ReputationScore(BaseModel): +class ReputationScore(FlowsintType): """Represents a reputation score for an entity with historical data and trends.""" entity_id: str = Field(..., description="Entity identifier", title="Entity ID") @@ -54,3 +56,11 @@ class ReputationScore(BaseModel): recommendations: Optional[List[str]] = Field( None, 
description="Recommendations based on score", title="Recommendations" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + parts = [self.entity_id] + if self.score is not None: + parts.append(f"Score: {self.score}") + self.label = " - ".join(parts) + return self diff --git a/flowsint-types/src/flowsint_types/risk_profile.py b/flowsint-types/src/flowsint_types/risk_profile.py index 38a5232..ac227cd 100644 --- a/flowsint-types/src/flowsint_types/risk_profile.py +++ b/flowsint-types/src/flowsint_types/risk_profile.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class RiskProfile(BaseModel): +class RiskProfile(FlowsintType): """Represents a comprehensive risk assessment profile for an entity.""" entity_id: str = Field(..., description="Entity identifier", title="Entity ID") @@ -71,3 +73,11 @@ class RiskProfile(BaseModel): next_review_date: Optional[str] = Field( None, description="Next review date", title="Next Review Date" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + parts = [self.entity_id] + if self.risk_level: + parts.append(f"Risk: {self.risk_level}") + self.label = " - ".join(parts) + return self diff --git a/flowsint-types/src/flowsint_types/script.py b/flowsint-types/src/flowsint_types/script.py index 7d564fe..c7bed27 100644 --- a/flowsint-types/src/flowsint_types/script.py +++ b/flowsint-types/src/flowsint_types/script.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Script(BaseModel): +class Script(FlowsintType): """Represents a script or code file with analysis and security information.""" script_id: str = Field( @@ -63,3 +65,11 @@ class Script(BaseModel): 
minified: Optional[bool] = Field( None, description="Whether script is minified", title="Minified" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + if self.name: + self.label = self.name + else: + self.label = self.script_id + return self diff --git a/flowsint-types/src/flowsint_types/session.py b/flowsint-types/src/flowsint_types/session.py index 9d025e4..bd3692f 100644 --- a/flowsint-types/src/flowsint_types/session.py +++ b/flowsint-types/src/flowsint_types/session.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class Session(BaseModel): +class Session(FlowsintType): """Represents a user session with device and activity information.""" session_id: str = Field( @@ -49,3 +51,15 @@ class Session(BaseModel): source: Optional[str] = Field( None, description="Source of session information", title="Source" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + parts = [] + if self.user_id: + parts.append(self.user_id) + if self.service: + parts.append(self.service) + if not parts: + parts.append(self.session_id) + self.label = " - ".join(parts) + return self diff --git a/flowsint-types/src/flowsint_types/social_account.py b/flowsint-types/src/flowsint_types/social_account.py index efa4b37..a8954fd 100644 --- a/flowsint-types/src/flowsint_types/social_account.py +++ b/flowsint-types/src/flowsint_types/social_account.py @@ -1,9 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self from .username import Username +from .flowsint_base import FlowsintType -class SocialAccount(BaseModel): +class SocialAccount(FlowsintType): """Represents a social media account (the 'home' of a username).""" username: Username = Field(..., description="Username 
associated with this account", title="Username") @@ -22,3 +23,12 @@ class SocialAccount(BaseModel): is_suspended: Optional[bool] = Field(None, description="Whether the account is suspended/banned", title="Is suspended") associated_emails: Optional[List[str]] = Field(None, description="Email addresses associated with the account", title="Associated emails") associated_phones: Optional[List[str]] = Field(None, description="Phone numbers associated with the account", title="Associated phones") + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Use display name if available, otherwise username + if self.display_name: + self.label = f"{self.display_name} (@{self.username.value})" + else: + self.label = f"@{self.username.value}" + return self diff --git a/flowsint-types/src/flowsint_types/ssl_certificate.py b/flowsint-types/src/flowsint_types/ssl_certificate.py index 3e33c65..8abc92f 100644 --- a/flowsint-types/src/flowsint_types/ssl_certificate.py +++ b/flowsint-types/src/flowsint_types/ssl_certificate.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class SSLCertificate(BaseModel): +class SSLCertificate(FlowsintType): """Represents an SSL/TLS certificate with validation and security details.""" subject: str = Field( @@ -69,3 +71,8 @@ class SSLCertificate(BaseModel): fingerprint_sha256: Optional[str] = Field( None, description="SHA256 fingerprint", title="SHA256 Fingerprint" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = self.subject + return self diff --git a/flowsint-types/src/flowsint_types/username.py b/flowsint-types/src/flowsint_types/username.py index 732d3b0..14c2b51 100644 --- a/flowsint-types/src/flowsint_types/username.py +++ b/flowsint-types/src/flowsint_types/username.py @@ -1,9 +1,10 @@ -from pydantic import BaseModel, 
Field, field_validator, model_validator -from typing import Optional, Any +from pydantic import Field, field_validator, model_validator +from typing import Optional, Any, Self import re +from .flowsint_base import FlowsintType -class Username(BaseModel): +class Username(FlowsintType): """Represents a username or handle on any platform.""" value: str = Field(..., description="Username or handle string", title="Username value") @@ -12,27 +13,27 @@ class Username(BaseModel): None, description="Last time this username was observed", title="Last seen at" ) - @model_validator(mode='before') - @classmethod - def convert_string_to_dict(cls, data: Any) -> Any: - """Allow creating Username from a string directly.""" - if isinstance(data, str): - return {'value': data} - return data - @field_validator('value') @classmethod def validate_username(cls, v: str) -> str: """Validate username format. - Username must be 3-30 characters long and contain only: + Username must be 3-80 characters long and contain only: - Letters (a-z, A-Z) - Numbers (0-9) - Underscores (_) - Hyphens (-) """ - if not re.match(r"^[a-zA-Z0-9_-]{3,30}$", v): + if not re.match(r"^[a-zA-Z0-9_-]{3,80}$", v): raise ValueError( - f"Invalid username: {v}. Must be 3-30 characters and contain only letters, numbers, underscores, and hyphens." + f"Invalid username: {v}. Must be 3-80 characters and contain only letters, numbers, underscores, and hyphens." 
) return v + + @model_validator(mode='after') + def compute_label(self) -> Self: + if self.platform: + self.label = f"@{self.value} ({self.platform})" + else: + self.label = f"@{self.value}" + return self diff --git a/flowsint-types/src/flowsint_types/wallet.py b/flowsint-types/src/flowsint_types/wallet.py index 6fd47b6..95f04a4 100644 --- a/flowsint-types/src/flowsint_types/wallet.py +++ b/flowsint-types/src/flowsint_types/wallet.py @@ -1,8 +1,10 @@ -from typing import Optional -from pydantic import BaseModel, Field, HttpUrl +from typing import Optional, Self +from pydantic import Field, HttpUrl, field_validator, model_validator +import re +from .flowsint_base import FlowsintType -class CryptoWallet(BaseModel): +class CryptoWallet(FlowsintType): """Represents a cryptocurrency wallet.""" address: str = Field(..., description="Wallet address", title="Wallet Address") @@ -10,8 +12,41 @@ class CryptoWallet(BaseModel): None, description="Wallet Explorer node ID", title="Node ID" ) + @field_validator('address') + @classmethod + def validate_address(cls, v: str) -> str: + """Validate that the wallet address is not empty and has a valid format.""" + if not v or not v.strip(): + raise ValueError("Wallet address cannot be empty") -class CryptoWalletTransaction(BaseModel): + # Strip whitespace + v = v.strip() + + # Basic validation: check if it looks like a valid crypto address + # Ethereum addresses start with 0x and are 42 characters (0x + 40 hex chars) + # Bitcoin addresses vary but are typically 26-35 characters + # We'll do a permissive check for common formats + if len(v) < 26: + raise ValueError("Wallet address is too short to be valid") + + # Check for common patterns + ethereum_pattern = r'^0x[a-fA-F0-9]{40}$' + bitcoin_pattern = r'^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$|^bc1[a-z0-9]{39,59}$' + + # If it matches Ethereum pattern, validate it + if v.startswith('0x'): + if not re.match(ethereum_pattern, v): + raise ValueError("Invalid Ethereum address format") + + return v 
+ + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = self.address + return self + + +class CryptoWalletTransaction(FlowsintType): """Represents a cryptocurrency transaction.""" source: CryptoWallet = Field( @@ -82,8 +117,35 @@ class CryptoWalletTransaction(BaseModel): None, description="Error message if transaction failed", title="Error Message" ) + @field_validator('value', 'amount', 'amount_usd') + @classmethod + def validate_positive_amounts(cls, v: Optional[float]) -> Optional[float]: + """Validate that monetary amounts are non-negative.""" + if v is not None and v < 0: + raise ValueError("Monetary amounts must be non-negative") + return v -class CryptoNFT(BaseModel): + @field_validator('gas', 'gas_price', 'gas_used', 'cumulative_gas_used', 'block_number', 'nonce', 'transaction_index', 'confirmations', 'hop') + @classmethod + def validate_non_negative_integers(cls, v: Optional[int]) -> Optional[int]: + """Validate that integer fields are non-negative.""" + if v is not None and v < 0: + raise ValueError("Integer values must be non-negative") + return v + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Use hash if available, otherwise create a descriptive label + if self.hash: + self.label = self.hash + elif self.source and self.target: + self.label = f"Transaction from {self.source.address[:8]}... to {self.target.address[:8]}..." + elif self.source: + self.label = f"Transaction from {self.source.address[:8]}..." 
+ return self + + +class CryptoNFT(FlowsintType): """Represents a Non-Fungible Token (NFT) held or minted by a wallet.""" wallet: CryptoWallet = Field(..., description="Source wallet", title="Wallet") @@ -135,6 +197,41 @@ class CryptoNFT(BaseModel): def uid(self): return f"{self.contract_address}:{self.token_id}" + @field_validator('contract_address') + @classmethod + def validate_contract_address(cls, v: str) -> str: + """Validate that the NFT contract address has a valid format.""" + if not v or not v.strip(): + raise ValueError("Contract address cannot be empty") + + v = v.strip() + + # NFT contracts are typically on Ethereum, so validate as Ethereum address + ethereum_pattern = r'^0x[a-fA-F0-9]{40}$' + if not re.match(ethereum_pattern, v): + raise ValueError("Invalid contract address format (expected Ethereum address: 0x followed by 40 hex characters)") + + return v + + @field_validator('token_id') + @classmethod + def validate_token_id(cls, v: str) -> str: + """Validate that the token ID is not empty.""" + if not v or not v.strip(): + raise ValueError("Token ID cannot be empty") + return v.strip() + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Prefer name, then collection_name with token_id, fallback to uid + if self.name: + self.label = self.name + elif self.collection_name: + self.label = f"{self.collection_name} #{self.token_id}" + else: + self.label = self.uid + return self + # Update forward references CryptoWallet.model_rebuild() diff --git a/flowsint-types/src/flowsint_types/weapon.py b/flowsint-types/src/flowsint_types/weapon.py index ee0f424..e418ef2 100644 --- a/flowsint-types/src/flowsint_types/weapon.py +++ b/flowsint-types/src/flowsint_types/weapon.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Literal, Optional, List +from pydantic import Field, model_validator +from typing import Literal, Optional, List, Self + +from .flowsint_base import FlowsintType -class Weapon(BaseModel): +class 
Weapon(FlowsintType): """Represents a weapon with detailed specifications and forensic information.""" name: str = Field(..., description="Weapon name or identifier", title="Name") @@ -112,3 +114,11 @@ class Weapon(BaseModel): notes: Optional[str] = Field( None, description="Additional notes or observations", title="Notes" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + parts = [self.name] + if self.type: + parts.append(f"({self.type})") + self.label = " ".join(parts) + return self diff --git a/flowsint-types/src/flowsint_types/web_tracker.py b/flowsint-types/src/flowsint_types/web_tracker.py index 36ba045..c9eb5e8 100644 --- a/flowsint-types/src/flowsint_types/web_tracker.py +++ b/flowsint-types/src/flowsint_types/web_tracker.py @@ -1,8 +1,10 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import Field, model_validator +from typing import Optional, List, Self + +from .flowsint_base import FlowsintType -class WebTracker(BaseModel): +class WebTracker(FlowsintType): """Represents a web tracking technology with privacy and compliance information.""" tracker_id: str = Field( @@ -57,3 +59,11 @@ class WebTracker(BaseModel): risk_level: Optional[str] = Field( None, description="Privacy risk level", title="Risk Level" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + if self.name: + self.label = self.name + else: + self.label = self.tracker_id + return self diff --git a/flowsint-types/src/flowsint_types/website.py b/flowsint-types/src/flowsint_types/website.py index 3e98636..bf3d9ef 100644 --- a/flowsint-types/src/flowsint_types/website.py +++ b/flowsint-types/src/flowsint_types/website.py @@ -1,9 +1,10 @@ -from typing import List, Optional -from pydantic import BaseModel, Field, HttpUrl +from typing import List, Optional, Self +from pydantic import Field, HttpUrl, model_validator from .domain import Domain +from .flowsint_base import FlowsintType -class Website(BaseModel): +class 
Website(FlowsintType): """Represents a website with its URL, domain, and redirect information.""" url: HttpUrl = Field( @@ -18,3 +19,8 @@ class Website(BaseModel): active: Optional[bool] = Field( False, description="Whether the website is active", title="Is Active" ) + + @model_validator(mode='after') + def compute_label(self) -> Self: + self.label = str(self.url) + return self diff --git a/flowsint-types/src/flowsint_types/whois.py b/flowsint-types/src/flowsint_types/whois.py index 6733536..7d2516d 100644 --- a/flowsint-types/src/flowsint_types/whois.py +++ b/flowsint-types/src/flowsint_types/whois.py @@ -1,10 +1,12 @@ -from typing import Optional, Union -from pydantic import BaseModel, Field, field_validator +from typing import Optional, Union, Self +from pydantic import Field, field_validator, model_validator from .email import Email from .domain import Domain +from .organization import Organization +from .flowsint_base import FlowsintType -class Whois(BaseModel): +class Whois(FlowsintType): """Represents WHOIS domain registration information.""" domain: Domain = Field(..., description="Domain information", title="Domain") @@ -14,9 +16,9 @@ class Whois(BaseModel): registrar: Optional[str] = Field( None, description="Domain registrar name", title="Registrar" ) - org: Optional[str] = Field( + organization: Optional[Organization] = Field( None, - description="Organization name associated with the domain", + description="Organization associated with the domain", title="Organization", ) city: Optional[str] = Field( @@ -46,3 +48,26 @@ class Whois(BaseModel): elif isinstance(v, dict): return Domain(**v) return v + + @field_validator('organization', mode='before') + @classmethod + def convert_organization(cls, v: Union[str, dict, Organization, None]) -> Optional[Organization]: + """Convert string or dict to Organization object if needed.""" + if v is None: + return None + if isinstance(v, Organization): + return v + elif isinstance(v, str): + return Organization(name=v) 
+ elif isinstance(v, dict): + return Organization(**v) + return v + + + @model_validator(mode='after') + def compute_label(self) -> Self: + # Use domain and organization if available + if self.organization: + self.label = f"{self.domain.domain} - {self.organization.name}" + else: + self.label = self.domain.domain + return self diff --git a/flowsint-types/tests/domain.py b/flowsint-types/tests/domain.py new file mode 100644 index 0000000..5ffbd35 --- /dev/null +++ b/flowsint-types/tests/domain.py @@ -0,0 +1,42 @@ +from flowsint_types.domain import Domain +import pytest + + +def test_valid_domain_from_object(): + domain = Domain(**{"domain": "mydomain.com"}) + assert domain.domain == "mydomain.com" + assert domain.label == "mydomain.com" + assert domain.root == True + + +def test_valid_subdomain_from_object(): + domain = Domain(**{"domain": "blog.mydomain.com"}) + assert domain.domain == "blog.mydomain.com" + assert domain.label == "blog.mydomain.com" + assert domain.root == False + + +def test_valid_domain_from_instance(): + domain = Domain(domain="mydomain.com") + assert domain.domain == "mydomain.com" + assert domain.label == "mydomain.com" + assert domain.root == True + + +def test_valid_subdomain_from_instance(): + domain = Domain(domain="blog.mydomain.com") + assert domain.domain == "blog.mydomain.com" + assert domain.label == "blog.mydomain.com" + assert domain.root == False + + +def test_invalid_domain_from_object(): + with pytest.raises(Exception) as e_info: + Domain(**{"domain": "my_domain.com"}) + assert "Invalid domain" in str(e_info.value) + + +def test_domain_type_from_none(): + with pytest.raises(Exception) as e_info: + Domain() + assert "1 validation error for Domain" in str(e_info.value) diff --git a/flowsint-types/tests/individual.py b/flowsint-types/tests/individual.py new file mode 100644 index 0000000..cfe81bd --- /dev/null +++ b/flowsint-types/tests/individual.py @@ -0,0 +1,15 @@ +from flowsint_types.individual import Individual +import pytest + + 
+def test_valid_individual(): + individual = Individual(first_name="John", last_name="Doe") + assert individual.first_name == "John" + assert individual.last_name == "Doe" + assert individual.label == "John Doe" + + +def test_invalid_individual(): + with pytest.raises(Exception) as e_info: + Individual(name="John Doe") + assert "validation errors for Individual" in str(e_info.value) diff --git a/flowsint-types/tests/ip.py b/flowsint-types/tests/ip.py new file mode 100644 index 0000000..5c34151 --- /dev/null +++ b/flowsint-types/tests/ip.py @@ -0,0 +1,14 @@ +from flowsint_types.ip import Ip +import pytest + + +def test_valid_ip(): + ip = Ip(address="12.23.34.56") + assert ip.address == "12.23.34.56" + assert ip.label == "12.23.34.56" + + +def test_invalid_ip(): + with pytest.raises(Exception) as e_info: + Ip(address="12.23.34.564") + assert "Invalid IP address" in str(e_info.value)