diff --git a/flowsint-api/app/api/routes/events.py b/flowsint-api/app/api/routes/events.py
index 27cc864..8d1050c 100644
--- a/flowsint-api/app/api/routes/events.py
+++ b/flowsint-api/app/api/routes/events.py
@@ -29,7 +29,7 @@ def get_logs_by_sketch(
raise HTTPException(status_code=404, detail=f"Sketch with id {sketch_id} not found")
print(f"[EventEmitter] Fetching logs for sketch {sketch_id} (limit: {limit}, since: {since})")
- query = db.query(Log).filter(Log.sketch_id == sketch_id).order_by(Log.created_at.asc())
+ query = db.query(Log).filter(Log.sketch_id == sketch_id).order_by(Log.created_at.desc())
if since:
query = query.filter(Log.created_at > since)
@@ -39,6 +39,9 @@ def get_logs_by_sketch(
logs = query.limit(limit).all()
+ # Reverse to show chronologically (oldest to newest)
+ logs = list(reversed(logs))
+
results = []
for log in logs:
# Ensure payload is always a dictionary
diff --git a/flowsint-api/app/api/routes/transforms.py b/flowsint-api/app/api/routes/transforms.py
index 564d52d..0376f1b 100644
--- a/flowsint-api/app/api/routes/transforms.py
+++ b/flowsint-api/app/api/routes/transforms.py
@@ -69,17 +69,18 @@ async def get_material_list():
scanner_categories = {
category: [
{
- "class_name": scanner["class_name"],
- "category": scanner["category"],
- "name": scanner["name"],
- "module": scanner["module"],
- "doc": scanner["doc"],
- "inputs": scanner["inputs"],
- "outputs": scanner["outputs"],
+ "class_name": scanner.get("class_name"),
+ "category": scanner.get("category"),
+ "name": scanner.get("name"),
+ "module": scanner.get("module"),
+ "doc": scanner.get("doc"),
+ "inputs": scanner.get("inputs"),
+ "outputs": scanner.get("outputs"),
"type": "scanner",
- "params": scanner["params"],
- "params_schema": scanner["params_schema"],
- "requires_key": scanner["requires_key"]
+ "params": scanner.get("params"),
+ "params_schema": scanner.get("params_schema"),
+ "required_params": scanner.get("required_params"),
+ "icon": scanner.get("icon")
}
for scanner in scanner_list
]
diff --git a/flowsint-api/app/scanners/base.py b/flowsint-api/app/scanners/base.py
index 5921c41..16b475a 100644
--- a/flowsint-api/app/scanners/base.py
+++ b/flowsint-api/app/scanners/base.py
@@ -1,17 +1,15 @@
from abc import ABC, abstractmethod
-from typing import List, Dict, Any, Optional
-from pydantic import ValidationError, BaseModel, Field, create_model
+from typing import List, Dict, Any, Optional, get_origin, get_args
+from pydantic import ValidationError, BaseModel, Field, create_model, TypeAdapter
from pydantic.config import ConfigDict
-
from app.core.graph_db import Neo4jConnection
from app.core.logger import Logger
from app.core.vault import VaultProtocol
-
+from app.utils import resolve_type
class InvalidScannerParams(Exception):
pass
-
def build_params_model(params_schema: list) -> BaseModel:
"""
Build a strict Pydantic model from a params_schema.
@@ -21,10 +19,10 @@ def build_params_model(params_schema: list) -> BaseModel:
for param in params_schema:
name = param["name"]
- typ = str # You can later enhance this to support int, bool, etc.
+ type = str # You can later enhance this to support int, bool, etc.
required = param.get("required", False)
default = ... if required else param.get("default")
- fields[name] = (Optional[typ], Field(default=default, description=param.get("description", "")))
+ fields[name] = (Optional[type], Field(default=default, description=param.get("description", "")))
model = create_model(
"ParamsModel",
@@ -35,6 +33,64 @@ def build_params_model(params_schema: list) -> BaseModel:
return model
class Scanner(ABC):
+ """
+ Abstract base class for all scanners.
+
+ ## InputType and OutputType Pattern
+
+ Scanners only need to define InputType and OutputType as class attributes.
+ The base class automatically handles schema generation:
+
+ ```python
+ from typing import List
+ from app.types.domain import Domain
+ from app.types.ip import Ip
+
+ class MyScanner(Scanner):
+ # Define types as class attributes
+ InputType = List[Domain]
+ OutputType = List[Ip]
+
+ @classmethod
+ def name(cls):
+ return "my_scanner"
+
+ @classmethod
+ def category(cls):
+ return "Domain"
+
+ @classmethod
+ def key(cls):
+ return "domain"
+
+ def preprocess(self, data: InputType) -> InputType:
+ cleaned: InputType = []
+ # ... implementation
+ return cleaned
+
+ async def scan(self, data: InputType) -> OutputType:
+ results: OutputType = []
+ # ... implementation
+ return results
+
+ # Make types available at module level for easy access
+ InputType = MyScanner.InputType
+ OutputType = MyScanner.OutputType
+ ```
+
+ The base class automatically provides:
+ - input_schema() method using InputType
+ - output_schema() method using OutputType
+ - Error handling for missing type definitions
+ - Consistent schema generation across all scanners
+
+ Subclasses can override input_schema() or output_schema() if needed for special cases.
+ """
+
+ # Abstract type aliases that must be defined in subclasses for runtime use
+ InputType = NotImplemented
+ OutputType = NotImplemented
+
def __init__(
self,
sketch_id: Optional[str] = None,
@@ -106,13 +162,17 @@ class Scanner(ABC):
@classmethod
- def requires_key(self) -> bool:
+ def required_params(self) -> bool:
return False
@classmethod
@abstractmethod
def name(cls) -> str:
pass
+
+ @classmethod
+ def icon(cls) -> str | None:
+ return None
@classmethod
@abstractmethod
@@ -126,9 +186,12 @@ class Scanner(ABC):
pass
@classmethod
- @abstractmethod
def input_schema(cls) -> Dict[str, Any]:
- pass
+ """
+ Generate input schema from InputType class attribute.
+ Subclasses don't need to override this unless they have special requirements.
+ """
+ return cls.generate_input_schema()
@classmethod
def get_params_schema(cls) -> List[Dict[str, Any]]:
@@ -136,9 +199,92 @@ class Scanner(ABC):
return []
@classmethod
- @abstractmethod
def output_schema(cls) -> Dict[str, Any]:
- pass
+ """
+ Generate output schema from OutputType class attribute.
+ Subclasses don't need to override this unless they have special requirements.
+ """
+ return cls.generate_output_schema()
+
+ @classmethod
+ def generate_input_schema(cls) -> Dict[str, Any]:
+ """
+ Helper method to generate input schema from InputType class attribute.
+
+ Raises:
+ NotImplementedError: If InputType is not defined in the subclass
+ """
+ if cls.InputType is NotImplemented:
+ raise NotImplementedError(f"InputType must be defined in {cls.__name__}")
+
+ adapter = TypeAdapter(cls.InputType)
+ schema = adapter.json_schema()
+
+ # Handle different schema structures
+ if "$defs" in schema and schema["$defs"]:
+ # Follow the $ref in items to get the correct type (not just the first one)
+ items_ref = schema.get("items", {}).get("$ref")
+ if items_ref:
+ # Extract type name from $ref like "#/$defs/Website" -> "Website"
+ type_name = items_ref.split("/")[-1]
+ details = schema["$defs"][type_name]
+ else:
+ # Fallback: get the first type definition (for backward compatibility)
+ type_name, details = list(schema["$defs"].items())[0]
+
+ return {
+ "type": type_name,
+ "properties": [
+ {"name": prop, "type": resolve_type(info, schema)}
+ for prop, info in details["properties"].items()
+ ]
+ }
+ else:
+ # Handle simpler schemas
+ return {
+ "type": schema.get("title", "Any"),
+ "properties": [{"name": "value", "type": "object"}]
+ }
+
+ @classmethod
+ def generate_output_schema(cls) -> Dict[str, Any]:
+ """
+ Helper method to generate output schema from OutputType class attribute.
+
+ Raises:
+ NotImplementedError: If OutputType is not defined in the subclass
+ """
+ if cls.OutputType is NotImplemented:
+ raise NotImplementedError(f"OutputType must be defined in {cls.__name__}")
+
+ adapter = TypeAdapter(cls.OutputType)
+ schema = adapter.json_schema()
+
+ # Handle different schema structures
+ if "$defs" in schema and schema["$defs"]:
+ # Follow the $ref in items to get the correct type (not just the first one)
+ items_ref = schema.get("items", {}).get("$ref")
+ if items_ref:
+ # Extract type name from $ref like "#/$defs/Website" -> "Website"
+ type_name = items_ref.split("/")[-1]
+ details = schema["$defs"][type_name]
+ else:
+ # Fallback: get the first type definition (for backward compatibility)
+ type_name, details = list(schema["$defs"].items())[0]
+
+ return {
+ "type": type_name,
+ "properties": [
+ {"name": prop, "type": resolve_type(info, schema)}
+ for prop, info in details["properties"].items()
+ ]
+ }
+ else:
+ # Handle simpler schemas
+ return {
+ "type": schema.get("title", "Any"),
+ "properties": [{"name": "value", "type": "object"}]
+ }
@abstractmethod
async def scan(self, values: List[str]) -> List[Dict[str, Any]]:
@@ -159,7 +305,6 @@ class Scanner(ABC):
async def execute(self, values: List[str]) -> List[Dict[str, Any]]:
if self.name() != "transform_orchestrator":
Logger.info(self.sketch_id, {"message": f"Scanner {self.name()} started."})
-
try:
await self.async_init()
preprocessed = self.preprocess(values)
diff --git a/flowsint-api/app/scanners/crypto/wallet_to_nfts.py b/flowsint-api/app/scanners/crypto/wallet_to_nfts.py
index b87d699..325318c 100644
--- a/flowsint-api/app/scanners/crypto/wallet_to_nfts.py
+++ b/flowsint-api/app/scanners/crypto/wallet_to_nfts.py
@@ -1,20 +1,21 @@
import os
import socket
-from typing import List, Dict, Any, Optional, TypeAlias, Union
-from pydantic import TypeAdapter
+from typing import List, Dict, Any, Optional, Union
import requests
from app.scanners.base import Scanner
from app.types.wallet import CryptoWallet, CryptoNFT
-from app.utils import resolve_type
from app.core.logger import Logger
from app.core.graph_db import Neo4jConnection
-InputType: TypeAlias = List[CryptoWallet]
-OutputType: TypeAlias = List[CryptoNFT]
ETHERSCAN_API_URL = os.getenv("ETHERSCAN_API_URL")
class CryptoWalletAddressToNFTs(Scanner):
"""Resolve NFTs for a wallet address (ETH)."""
+
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[CryptoWallet]
+ OutputType = List[CryptoNFT]
+
def __init__(
self,
sketch_id: Optional[str] = None,
@@ -33,7 +34,7 @@ class CryptoWalletAddressToNFTs(Scanner):
)
@classmethod
- def requires_key(cls) -> bool:
+ def required_params(cls) -> bool:
return True
@classmethod
@@ -67,32 +68,6 @@ class CryptoWalletAddressToNFTs(Scanner):
def key(cls) -> str:
return "address"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
@@ -108,12 +83,11 @@ class CryptoWalletAddressToNFTs(Scanner):
return cleaned
async def scan(self, data: InputType) -> OutputType:
- results: OutputType = []
results: OutputType = []
params = self.get_params()
- Logger.warn(self.sketch_id, {"message": f"{str(params)}"})
- api_key = params["ETHERSCAN_API_KEY"]
- api_url = params["ETHERSCAN_API_URL"]
+ Logger.debug(self.sketch_id, {"message": f"{str(params)}"})
+ api_key = params.get("ETHERSCAN_API_KEY", None)
+ api_url = params.get("ETHERSCAN_API_URL", None)
if not api_key:
Logger.error(self.sketch_id, {"message": "ETHERSCAN_API_KEY is required"})
raise ValueError("ETHERSCAN_API_KEY is required")
@@ -212,4 +186,8 @@ class CryptoWalletAddressToNFTs(Scanner):
Logger.graph_append(self.sketch_id, {"message": f"Found NFT for {nft.wallet.address}: {nft.contract_address} - {nft.token_id}"})
- return results
\ No newline at end of file
+ return results
+
+# Make types available at module level for easy access
+InputType = CryptoWalletAddressToNFTs.InputType
+OutputType = CryptoWalletAddressToNFTs.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/crypto/wallet_to_transactions.py b/flowsint-api/app/scanners/crypto/wallet_to_transactions.py
index a6ae78e..6beb1eb 100644
--- a/flowsint-api/app/scanners/crypto/wallet_to_transactions.py
+++ b/flowsint-api/app/scanners/crypto/wallet_to_transactions.py
@@ -1,24 +1,24 @@
import os
-from typing import List, Dict, Any, Optional, TypeAlias, Union
-from pydantic import TypeAdapter
+from typing import List, Dict, Any, Optional, Union
import requests
import requests.exceptions
from datetime import datetime
from app.scanners.base import Scanner
from app.types.wallet import CryptoWallet, CryptoWalletTransaction
-from app.utils import resolve_type
from app.core.logger import Logger
from app.core.graph_db import Neo4jConnection
-InputType: TypeAlias = List[CryptoWallet]
-OutputType: TypeAlias = List[CryptoWalletTransaction]
-
ETHERSCAN_API_URL = os.getenv("ETHERSCAN_API_URL")
def wei_to_eth(wei_str):
return int(wei_str) / 10**18
class CryptoWalletAddressToTransactions(Scanner):
+
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[CryptoWallet]
+ OutputType = List[CryptoWalletTransaction]
+
def __init__(
self,
sketch_id: Optional[str] = None,
@@ -37,8 +37,12 @@ class CryptoWalletAddressToTransactions(Scanner):
)
@classmethod
- def requires_key(cls) -> bool:
+ def required_params(cls) -> bool:
return True
+
+ @classmethod
+ def icon(cls) -> str | None:
+ return "cryptowallet"
@classmethod
def get_params_schema(cls) -> List[Dict[str, Any]]:
@@ -71,32 +75,6 @@ class CryptoWalletAddressToTransactions(Scanner):
def key(cls) -> str:
return "address"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
@@ -262,4 +240,8 @@ class CryptoWalletAddressToTransactions(Scanner):
})
Logger.graph_append(self.sketch_id, {"message": f"Transaction on {datetime.fromtimestamp(int(tx.timestamp)).strftime('%Y-%m-%d %H:%M:%S') if tx.timestamp else 'Unknown time'}: {tx.source.address} -> {tx.target.address}"})
- return results
\ No newline at end of file
+ return results
+
+# Make types available at module level for easy access
+InputType = CryptoWalletAddressToTransactions.InputType
+OutputType = CryptoWalletAddressToTransactions.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/domains/domain_to_asn.py b/flowsint-api/app/scanners/domains/domain_to_asn.py
index 624d44a..0e375db 100644
--- a/flowsint-api/app/scanners/domains/domain_to_asn.py
+++ b/flowsint-api/app/scanners/domains/domain_to_asn.py
@@ -1,21 +1,20 @@
import json
import socket
import subprocess
-from typing import List, Dict, Any, TypeAlias, Union
-from pydantic import TypeAdapter
+from typing import List, Union
from app.scanners.base import Scanner
-from app.types.cidr import CIDR
from app.types.domain import Domain
from app.types.asn import ASN
-from app.utils import is_valid_domain, resolve_type
+from app.utils import is_valid_domain
from app.core.logger import Logger
-InputType: TypeAlias = List[Domain]
-OutputType: TypeAlias = List[ASN]
-
class DomainToAsnScanner(Scanner):
"""Takes a domain and returns its corresponding ASN."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Domain]
+ OutputType = List[ASN]
+
@classmethod
def name(cls) -> str:
return "domain_to_asn_scanner"
@@ -28,111 +27,59 @@ class DomainToAsnScanner(Scanner):
def key(cls) -> str:
return "Domain"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
domain_obj = None
if isinstance(item, str):
- domain_obj = Domain(domain=item)
+ if is_valid_domain(item):
+ domain_obj = Domain(domain=item)
elif isinstance(item, dict) and "domain" in item:
- domain_obj = Domain(domain=item["domain"])
+ if is_valid_domain(item["domain"]):
+ domain_obj = Domain(domain=item["domain"])
elif isinstance(item, Domain):
domain_obj = item
- if domain_obj and is_valid_domain(domain_obj.domain):
+ if domain_obj:
cleaned.append(domain_obj)
return cleaned
async def scan(self, data: InputType) -> OutputType:
- """Find ASN information for domain name using asnmap."""
- asns: OutputType = []
+ results: OutputType = []
for domain in data:
- asn_data = self.__get_asn_from_asnmap(domain.domain)
- if asn_data:
- Logger.info(self.sketch_id, {"message": f"Domain {domain.domain} has ASN {asn_data['as_number']}."})
- asns.append(ASN(
- number=int(asn_data["as_number"].lstrip("AS")),
- name=asn_data["as_name"],
- country=asn_data["as_country"],
- cidrs=[CIDR(network=cidr) for cidr in asn_data["as_range"]]
- ))
- else:
- Logger.info(self.sketch_id, {"message": f"No ASN found for domain {domain.domain}"})
- return asns
-
- def __get_asn_from_asnmap(self, domain: str) -> Dict[str, Any]:
- try:
- command = f"echo {domain} | asnmap -silent -json | jq"
- result = subprocess.run(
- command,
- shell=True,
- capture_output=True, text=True, timeout=60
- )
- if not result.stdout.strip():
- return None
- return json.loads(result.stdout)
- except Exception as e:
- Logger.error(self.sketch_id, {"message": f"asnmap exception for {domain}: {str(e)}"})
- return None
-
- def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
- # Create Neo4j relationships between domain and their corresponding ASNs
- for input_ip, result_asn in zip(original_input, results):
- # Skip if no valid ASN was found
- if result_asn.number == 0:
- continue
- Logger.graph_append(self.sketch_id, {"message": f"Domain {input_ip.domain} -> ASN {result_asn.number}"})
+ try:
+ # First resolve domain to IP
+ ip = socket.gethostbyname(domain.domain)
- query = """
- MERGE (domain:domain {domain: $domain})
- SET domain.sketch_id = $sketch_id,
- domain.label = $domain,
- domain.type = "domain"
-
- MERGE (asn:asn {number: $asn_number})
- SET asn.sketch_id = $sketch_id,
- asn.name = $asn_name,
- asn.country = $asn_country,
- asn.label = $asn_label,
- asn.type = "asn"
-
- MERGE (domain)-[:BELONGS_TO {sketch_id: $sketch_id}]->(asn)
- """
-
- if self.neo4j_conn:
- self.neo4j_conn.query(query, {
- "domain": input_ip.domain,
- "asn_number": result_asn.number,
- "asn_name": result_asn.name,
- "asn_country": result_asn.country,
- "asn_label": f"AS{result_asn.number}",
- "asn_caption": f"AS{result_asn.number} - {result_asn.name}",
- "sketch_id": self.sketch_id,
- })
+ # Use asnmap to get ASN info
+ result = subprocess.run(
+ ["asnmap", "-a", ip, "-json"],
+ capture_output=True,
+ text=True,
+ timeout=30
+ )
+
+ if result.returncode == 0:
+ output = result.stdout.strip()
+ if output:
+ asn_data = json.loads(output)
+ if asn_data and 'as_number' in asn_data:
+ asn = ASN(
+ asn=str(asn_data['as_number']),
+ name=asn_data.get('as_name', ''),
+ org=asn_data.get('as_org', ''),
+ country=asn_data.get('as_country', '')
+ )
+ results.append(asn)
+
+ except Exception as e:
+ Logger.error(self.sketch_id, {"message": f"Error getting ASN for domain {domain.domain}: {e}"})
+ continue
+
+ return results
- return results
\ No newline at end of file
+ def postprocess(self, results: OutputType, input_data: InputType = None) -> OutputType:
+ return results
+
+# Make types available at module level for easy access
+InputType = DomainToAsnScanner.InputType
+OutputType = DomainToAsnScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/domains/resolve.py b/flowsint-api/app/scanners/domains/resolve.py
index 6d05a8a..a5e4ec1 100644
--- a/flowsint-api/app/scanners/domains/resolve.py
+++ b/flowsint-api/app/scanners/domains/resolve.py
@@ -1,5 +1,5 @@
import socket
-from typing import List, Dict, Any, TypeAlias, Union
+from typing import List, Dict, Any, Union
from pydantic import TypeAdapter
from app.scanners.base import Scanner
from app.types.domain import Domain
@@ -9,11 +9,12 @@ import uuid
from app.types.transform import Node, Edge
from app.core.logger import Logger
-InputType: TypeAlias = List[Domain]
-OutputType: TypeAlias = List[Ip]
-
class ResolveScanner(Scanner):
"""Resolve domain names to IP addresses."""
+
+ # Define the input and output types as class attributes
+ InputType = List[Domain]
+ OutputType = List[Ip]
@classmethod
def name(cls) -> str:
@@ -27,32 +28,6 @@ class ResolveScanner(Scanner):
def key(cls) -> str:
return "domain"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
@@ -129,4 +104,8 @@ class ResolveScanner(Scanner):
Logger.graph_append(self.sketch_id, payload)
- return results
\ No newline at end of file
+ return results
+
+# Make types available at module level for easy access
+InputType = ResolveScanner.InputType
+OutputType = ResolveScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/domains/subdomains.py b/flowsint-api/app/scanners/domains/subdomains.py
index dff556c..a64ad77 100644
--- a/flowsint-api/app/scanners/domains/subdomains.py
+++ b/flowsint-api/app/scanners/domains/subdomains.py
@@ -1,18 +1,18 @@
import shutil
import requests
-import subprocess
-from typing import List, Dict, Any, TypeAlias, Union
+from typing import List, Union
from app.scanners.base import Scanner
-from app.types.domain import Domain, Domain
-from app.utils import is_valid_domain, resolve_type
-from pydantic import TypeAdapter
+from app.types.domain import Domain
+from app.utils import is_valid_domain
from app.core.logger import Logger
from app.tools.network.subfinder import SubfinderTool
-InputType: TypeAlias = List[Domain]
-OutputType: TypeAlias = List[Domain]
class SubdomainScanner(Scanner):
"""Scanner to find subdomains associated with a domain."""
+
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Domain | str]
+ OutputType = List[Domain]
@classmethod
def name(cls) -> str:
@@ -26,32 +26,6 @@ class SubdomainScanner(Scanner):
def key(cls) -> str:
return "domain"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
@@ -146,3 +120,7 @@ class SubdomainScanner(Scanner):
Logger.graph_append(self.sketch_id, {"message":f"{domain_obj['domain']} -> {len(domain_obj['subdomains'])} subdomain(s) found."})
return output
+
+
+InputType = SubdomainScanner.InputType
+OutputType = SubdomainScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/domains/to_website.py b/flowsint-api/app/scanners/domains/to_website.py
index 1347815..a23acea 100644
--- a/flowsint-api/app/scanners/domains/to_website.py
+++ b/flowsint-api/app/scanners/domains/to_website.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, TypeAlias, Union
+from typing import List, Dict, Any, Union
import requests
from app.utils import is_valid_domain, resolve_type
from app.scanners.base import Scanner
@@ -7,13 +7,13 @@ from app.types.website import Website
from pydantic import TypeAdapter
from app.core.logger import Logger
-InputType: TypeAlias = List[Domain]
-OutputType: TypeAlias = List[Website]
-
-
class DomainToWebsiteScanner(Scanner):
"""From domain to website."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Domain]
+ OutputType = List[Website]
+
@classmethod
def name(cls) -> str:
return "to_website"
@@ -26,95 +26,54 @@ class DomainToWebsiteScanner(Scanner):
def key(cls) -> str:
return "domain"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # Find the Website type in $defs
- website_def = schema["$defs"].get("Website")
- if not website_def:
- raise ValueError("Website type not found in schema")
- return {
- "type": "Website",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in website_def["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
domain_obj = None
if isinstance(item, str):
- domain_obj = Domain(domain=item)
+ if is_valid_domain(item):
+ domain_obj = Domain(domain=item)
elif isinstance(item, dict) and "domain" in item:
- domain_obj = Domain(domain=item["domain"])
+ if is_valid_domain(item["domain"]):
+ domain_obj = Domain(domain=item["domain"])
elif isinstance(item, Domain):
domain_obj = item
- if domain_obj and is_valid_domain(domain_obj.domain):
+ if domain_obj:
cleaned.append(domain_obj)
return cleaned
-
- def __is_site_active(self, url, timeout=5):
- try:
- session = requests.Session()
- response = session.get(url, timeout=timeout, allow_redirects=False)
-
- redirects = []
- current_url = url
-
- # Follow redirects manually to capture the chain
- while response.status_code in [301, 302, 303, 307, 308]:
- redirects.append(current_url)
- if 'Location' in response.headers:
- next_url = response.headers['Location']
- # Handle relative URLs
- if not next_url.startswith(('http://', 'https://')):
- from urllib.parse import urljoin
- next_url = urljoin(current_url, response.headers['Location'])
-
- current_url = next_url
- response = session.get(current_url, timeout=timeout, allow_redirects=False)
- else:
- break
-
- # Get the final response with redirects allowed
- final_response = requests.get(url, timeout=timeout, allow_redirects=True)
- return final_response.status_code == 200, final_response.url, redirects
- except requests.RequestException:
- return False, url, []
async def scan(self, data: InputType) -> OutputType:
- """To website"""
results: OutputType = []
- for d in data:
+ for domain in data:
try:
- initial_url = f"https://{d.domain}"
- is_active, final_url, redirects = self.__is_site_active(initial_url)
+ # Try HTTPS first
+ try:
+ https_url = f"https://{domain.domain}"
+ response = requests.head(https_url, timeout=10, allow_redirects=True)
+ if response.status_code < 400:
+ results.append(Website(url=https_url))
+ continue
+ except requests.RequestException:
+ pass
- # Use the last redirect URL as the main URL, or the final URL if no redirects
- main_url = redirects[-1] if redirects else final_url
-
- website = Website(url=main_url, redirects=redirects, domain=d, active=is_active)
- results.append(website)
+ # Try HTTP if HTTPS fails
+ try:
+ http_url = f"http://{domain.domain}"
+ response = requests.head(http_url, timeout=10, allow_redirects=True)
+ if response.status_code < 400:
+ results.append(Website(url=http_url))
+ continue
+ except requests.RequestException:
+ pass
+
+ # If both fail, still add HTTPS URL as default
+ results.append(Website(url=f"https://{domain.domain}"))
+
except Exception as e:
- print(e)
- continue
-
+ Logger.error(self.sketch_id, {"message": f"Error converting domain {domain.domain} to website: {e}"})
+ # Add HTTPS URL as fallback
+ results.append(Website(url=f"https://{domain.domain}"))
+
return results
def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
@@ -158,4 +117,8 @@ class DomainToWebsiteScanner(Scanner):
}
Logger.graph_append(self.sketch_id, payload)
- return results
\ No newline at end of file
+
+
+# Make types available at module level for easy access
+InputType = DomainToWebsiteScanner.InputType
+OutputType = DomainToWebsiteScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/domains/whois.py b/flowsint-api/app/scanners/domains/whois.py
index 9d7071c..007fc3c 100644
--- a/flowsint-api/app/scanners/domains/whois.py
+++ b/flowsint-api/app/scanners/domains/whois.py
@@ -1,5 +1,5 @@
import json
-from typing import List, Dict, Any, TypeAlias, Union
+from typing import List, Dict, Any, Union
import whois
from app.utils import is_valid_domain, resolve_type
from app.scanners.base import Scanner
@@ -9,13 +9,13 @@ from app.types.email import Email
from pydantic import TypeAdapter
from app.core.logger import Logger
-InputType: TypeAlias = List[Domain]
-OutputType: TypeAlias = List[Whois]
-
-
class WhoisScanner(Scanner):
"""Scan for WHOIS information of a domain."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Domain]
+ OutputType = List[Whois]
+
@classmethod
def name(cls) -> str:
return "to_whois"
@@ -28,70 +28,51 @@ class WhoisScanner(Scanner):
def key(cls) -> str:
return "domain"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # Find the Website type in $defs
- whois_def = schema["$defs"].get("Whois")
- if not whois_def:
- raise ValueError("Whois type not found in schema")
- return {
- "type": "Whois",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in whois_def["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
domain_obj = None
if isinstance(item, str):
- domain_obj = Domain(domain=item)
+ if is_valid_domain(item):
+ domain_obj = Domain(domain=item)
elif isinstance(item, dict) and "domain" in item:
- domain_obj = Domain(domain=item["domain"])
+ if is_valid_domain(item["domain"]):
+ domain_obj = Domain(domain=item["domain"])
elif isinstance(item, Domain):
domain_obj = item
- if domain_obj and is_valid_domain(domain_obj.domain):
+ if domain_obj:
cleaned.append(domain_obj)
return cleaned
async def scan(self, data: InputType) -> OutputType:
- """Extract WHOIS data for each domain."""
results: OutputType = []
- for d in data:
+ for domain in data:
try:
- w = whois.whois(d.domain)
- w_data = json.loads(json.dumps(w, default=str))
- whois_obj = Whois(
- registrar=w_data.get("registrar"),
- org=w_data.get("org"),
- city=w_data.get("city"),
- country=w_data.get("country"),
- email=Email(email=w_data["emails"][0]) if isinstance(w_data.get("emails"), list) else None,
- creation_date=str(w_data.get("creation_date")) if w_data.get("creation_date") else None,
- expiration_date=str(w_data.get("expiration_date")) if w_data.get("expiration_date") else None,
- )
- results.append({"whois": whois_obj, "domain": d.domain})
-
+ whois_info = whois.whois(domain.domain)
+ if whois_info:
+ # Extract emails from whois data
+ emails = []
+ if whois_info.emails:
+ if isinstance(whois_info.emails, list):
+ emails = [Email(email=email) for email in whois_info.emails if email]
+ else:
+ emails = [Email(email=whois_info.emails)]
+
+ whois_obj = Whois(
+ domain=domain.domain,
+ registrar=str(whois_info.registrar) if whois_info.registrar else None,
+ creation_date=whois_info.creation_date,
+ expiration_date=whois_info.expiration_date,
+ name_servers=whois_info.name_servers if whois_info.name_servers else [],
+ emails=emails,
+ raw_text=str(whois_info)
+ )
+ results.append(whois_obj)
+
except Exception as e:
- print(e)
+ Logger.error(self.sketch_id, {"message": f"Error getting WHOIS for domain {domain.domain}: {e}"})
continue
-
+
return results
def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
@@ -100,7 +81,6 @@ class WhoisScanner(Scanner):
continue
whois_obj = domain["whois"]
Logger.graph_append(self.sketch_id, {"message": f"WHOIS for {domain['domain']} -> registrar: {whois_obj.registrar} org: {whois_obj.org} city: {whois_obj.city} country: {whois_obj.country} creation_date: {whois_obj.creation_date} expiration_date: {whois_obj.expiration_date}"})
-
props = {
"domain": domain["domain"],
"registrar": whois_obj.registrar,
@@ -184,3 +164,10 @@ class WhoisScanner(Scanner):
})
return results
+
+
+
+
+# Make types available at module level for easy access
+InputType = WhoisScanner.InputType
+OutputType = WhoisScanner.OutputType
diff --git a/flowsint-api/app/scanners/emails/holehe.py b/flowsint-api/app/scanners/emails/holehe.py
deleted file mode 100644
index 10c2c3e..0000000
--- a/flowsint-api/app/scanners/emails/holehe.py
+++ /dev/null
@@ -1,163 +0,0 @@
-from typing import Dict, Any, List, TypeAlias, Union
-from app.scanners.base import Scanner
-from app.types.email import Email
-from app.types.social import SocialProfile
-from pydantic import TypeAdapter
-from app.utils import is_valid_email, resolve_type
-import asyncio
-from app.core.logger import Logger
-InputType: TypeAlias = List[Email]
-OutputType: TypeAlias = List[SocialProfile]
-
-class HoleheScanner(Scanner):
- @classmethod
- def name(self) -> str:
- return "holehe_scanner"
-
- @classmethod
- def category(self) -> str:
- return "Email"
-
- @classmethod
- def key(self) -> str:
- return "email"
-
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info)}
- for prop, info in details["properties"].items()
- ]
- }
-
- def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
- cleaned: InputType = []
- for item in data:
- obj = None
- if isinstance(item, str):
- obj = Email(email=item)
- elif isinstance(item, dict) and "email" in item:
- obj = Email(email=item["email"])
- elif isinstance(item, Email):
- obj = item
-
- if obj and obj.email and is_valid_email(obj.email):
- cleaned.append(obj)
- return cleaned
-
- async def _perform_holehe_research(self, email: str) -> Dict[str, Any]:
- """
- Recherche asynchrone sur le réseau social et autres plateformes.
- """
- from holehe.modules.social_media import instagram, twitter, snapchat, bitmoji, crevado, discord, strava, imgur, myspace, fanpop, taringa, tellonym, tumblr, odnoklassniki, wattpad, xing, vsco
- from holehe.modules.shopping import amazon, ebay, deliveroo, garmin, vivino
- from holehe.modules.mails import google, yahoo, protonmail, mail_ru
- from holehe.modules.osint import rocketreach
- import httpx
-
- # Initialise le client httpx pour les requêtes HTTP asynchrones
- async with httpx.AsyncClient() as client:
- results = []
-
- modules = [
- amazon.amazon, google.google, yahoo.yahoo, protonmail.protonmail,
- instagram.instagram, twitter.twitter, snapchat.snapchat,
- rocketreach.rocketreach
- ]
-
- for module in modules:
- module_result = []
- try:
- await module(email, client, module_result)
- if module_result and module_result[0].get("exists") is not None:
- results.append(module_result[0])
- except Exception as e:
- results.append({"error": f"Error in {module.__name__}: {str(e)}"})
-
- return {"email": email, "results": results}
-
- async def scan(self, emails: List[str]) -> List[Dict[str, Any]]:
- """
- Effectue la recherche Holehe pour chaque email de la liste.
- """
- results = []
- for email in emails:
- found = []
- try:
- result = await self._perform_holehe_research(email)
- for result in result["results"]:
- if("error" not in result and "exists" in result):
- found.append(
- SocialProfile(
- username=email.email,
- profile_url=f"https://{result['domain']}",
- platform=result["name"]))
- except Exception as e:
- print(e)
- continue
- results.extend(found)
-
-
- return results
-
- def execute(self, values: List[str]) -> List[Dict[str, Any]]:
- preprocessed = self.preprocess(values)
- results = asyncio.run(self.scan(preprocessed))
- try:
- return self.postprocess(results, preprocessed)
- except TypeError as e:
- if "positional argument" in str(e) or "unexpected" in str(e):
- return self.postprocess(results)
- raise
-
-
- def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
- if not self.neo4j_conn:
- return results
-
- for profile in results:
- Logger.graph_append(self.sketch_id, {"message":f"{profile.username} -> account found on {profile.platform}"})
- self.neo4j_conn.query("""
- MERGE (p:social_profile {profile_url: $profile_url})
- SET p.platform = $platform,
- p.username = $username,
- p.label = $label,
- p.caption = $caption,
- p.type = $type,
- p.sketch_id = $sketch_id
-
- MERGE (i:email {email: $email})
- SET i.sketch_id = $sketch_id
- MERGE (i)-[:HAS_SOCIAL_ACCOUNT {sketch_id: $sketch_id}]->(p)
- """, {
- "profile_url": profile.profile_url,
- "username": profile.username,
- "platform": profile.platform,
- "label": f"{profile.platform}:{profile.username}",
- "caption": f"{profile.platform}:{profile.username}",
- "color": "#1DA1F2",
- "email": profile.username,
- "type": "social_profile",
- "sketch_id": self.sketch_id
- })
-
-
- return results
diff --git a/flowsint-api/app/scanners/emails/to_gravatar.py b/flowsint-api/app/scanners/emails/to_gravatar.py
index d3cb63b..e043a71 100644
--- a/flowsint-api/app/scanners/emails/to_gravatar.py
+++ b/flowsint-api/app/scanners/emails/to_gravatar.py
@@ -1,23 +1,16 @@
import hashlib
-from typing import List, Dict, Any, TypeAlias, Union
-from urllib.parse import urlparse
-
+from typing import List, Dict, Any, Union
import requests
-from app.utils import resolve_type
from app.scanners.base import Scanner
-from app.types.website import Website
-from app.types.domain import Domain
-from pydantic import TypeAdapter
from app.core.logger import Logger
from app.types.email import Email
from app.types.gravatar import Gravatar
-InputType: TypeAlias = List[Email]
-OutputType: TypeAlias = List[Gravatar]
-
-
class EmailToGravatarScanner(Scanner):
- """From email to gravatar."""
+ """From md5 hash of email to gravatar."""
+
+ InputType = List[Email]
+ OutputType = List[Gravatar]
@classmethod
def name(cls) -> str:
@@ -31,48 +24,14 @@ class EmailToGravatarScanner(Scanner):
def key(cls) -> str:
return "email"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- # Find the Email type in $defs
- website_def = schema["$defs"].get("Email")
- if not website_def:
- raise ValueError("Email type not found in schema")
- return {
- "type": "Email",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in website_def["properties"].items()
- ]
- }
-
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # Find the Gravatar type in $defs
- domain_def = schema["$defs"].get("Gravatar")
- if not domain_def:
- raise ValueError("Gravatar type not found in schema")
- return {
- "type": "Gravatar",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in domain_def["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
email_obj = None
if isinstance(item, str):
- # If it's a string, treat it as a str
email_obj = Email(email=item)
elif isinstance(item, dict) and "email" in item:
- email_obj = Email(**item)
+ email_obj = Email(email=item["email"])
elif isinstance(item, Email):
email_obj = item
if email_obj:
@@ -80,60 +39,51 @@ class EmailToGravatarScanner(Scanner):
return cleaned
async def scan(self, data: InputType) -> OutputType:
- """Fetch gravatar from email."""
results: OutputType = []
+
for email in data:
try:
- # MD5 hash of the email
- hash = hashlib.md5(email.email.encode()).hexdigest()
- url = f"https://www.gravatar.com/avatar/{hash}"
- response = requests.get(url)
+ # Generate MD5 hash of email
+ email_hash = hashlib.md5(email.email.lower().encode()).hexdigest()
+
+ # Query Gravatar API
+ gravatar_url = f"https://www.gravatar.com/avatar/{email_hash}?d=404"
+ response = requests.head(gravatar_url, timeout=10)
+
if response.status_code == 200:
- results.append(Gravatar(src=url, hash=hash))
- else:
- continue
+ # Gravatar found, get profile info
+ profile_url = f"https://www.gravatar.com/{email_hash}.json"
+ profile_response = requests.get(profile_url, timeout=10)
+
+ gravatar_data = {
+ "email": email.email,
+ "hash": email_hash,
+ "avatar_url": gravatar_url,
+ "profile_url": profile_url
+ }
+
+ if profile_response.status_code == 200:
+ profile_data = profile_response.json()
+ if "entry" in profile_data and profile_data["entry"]:
+ entry = profile_data["entry"][0]
+ gravatar_data.update({
+ "display_name": entry.get("displayName"),
+ "about_me": entry.get("aboutMe"),
+ "current_location": entry.get("currentLocation")
+ })
+
+ gravatar = Gravatar(**gravatar_data)
+ results.append(gravatar)
+
except Exception as e:
- Logger.info(self.sketch_id, {"message": f"No gravatar found for email {email.email}: {e}"})
+ Logger.error(self.sketch_id, {"message": f"Error checking Gravatar for email {email.email}: {e}"})
continue
-
+
return results
- def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
- for i, gravatar in enumerate(results):
- email = original_input[i] if i < len(original_input) else None
-
- query = """
- MERGE (g:gravatar {hash: $hash})
- SET g.sketch_id = $sketch_id,
- g.label = $src,
- g.type = "gravatar",
- g.src = $src
- """
- if email:
- query += """
- MERGE (e:email {email: $email})
- SET e.sketch_id = $sketch_id,
- e.label = $email,
- e.type = "email"
- MERGE (e)-[:HAS_GRAVATAR {sketch_id: $sketch_id}]->(g)
- """
-
- if self.neo4j_conn:
- params = {
- "hash": gravatar.hash,
- "src": str(gravatar.src),
- "sketch_id": self.sketch_id,
- }
- if email:
- params.update({
- "email": email.email,
- })
- self.neo4j_conn.query(query, params)
-
- email_address = email.email if email else "unknown"
- payload: Dict = {
- "message": f"{email_address} -> {gravatar.hash}"
- }
- Logger.graph_append(self.sketch_id, payload)
-
- return results
\ No newline at end of file
+ def postprocess(self, results: OutputType, input_data: InputType = None) -> OutputType:
+ return results
+
+# Make types available at module level for easy access
+InputType = EmailToGravatarScanner.InputType
+OutputType = EmailToGravatarScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/emails/to_leaks.py b/flowsint-api/app/scanners/emails/to_leaks.py
index c959bf3..24ff06a 100644
--- a/flowsint-api/app/scanners/emails/to_leaks.py
+++ b/flowsint-api/app/scanners/emails/to_leaks.py
@@ -1,14 +1,8 @@
-import hashlib
import os
-from typing import List, Dict, Any, TypeAlias, Union
-from urllib.parse import urlparse
-
+from typing import Any, Dict, List, Union
import requests
-from app.utils import resolve_type
+from urllib.parse import urljoin
from app.scanners.base import Scanner
-from app.types.website import Website
-from app.types.domain import Domain
-from pydantic import TypeAdapter
from app.core.logger import Logger
from app.types.email import Email
from app.types.breach import Breach
@@ -19,12 +13,11 @@ load_dotenv()
HIBP_API_KEY = os.getenv("HIBP_API_KEY")
-InputType: TypeAlias = List[Email]
-OutputType: TypeAlias = List[Breach]
-
-
class EmailToBreachesScanner(Scanner):
- """From email to breaches."""
+ """From email to breaches using Have I Been Pwned API."""
+
+ InputType = List[Email]
+ OutputType = List[Breach]
@classmethod
def name(cls) -> str:
@@ -37,49 +30,38 @@ class EmailToBreachesScanner(Scanner):
@classmethod
def key(cls) -> str:
return "email"
-
+
@classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- # Find the Email type in $defs
- website_def = schema["$defs"].get("Email")
- if not website_def:
- raise ValueError("Email type not found in schema")
- return {
- "type": "Email",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in website_def["properties"].items()
- ]
- }
-
-
+ def required_params(cls) -> bool:
+ return True
+
@classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # Find the Breach type in $defs
- breach_def = schema["$defs"].get("Breach")
- if not breach_def:
- raise ValueError("Breach type not found in schema")
- return {
- "type": "Breach",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in breach_def["properties"].items()
- ]
- }
+ def get_params_schema(cls) -> List[Dict[str, Any]]:
+ """Declare required parameters for this scanner"""
+ return [
+ {
+ "name": "HIBP_API_KEY",
+ "type": "vaultSecret",
+ "description": "The HIBP API key to use for breaches lookup.",
+ "required": True
+ },
+ {
+ "name": "HIBP_API_URL",
+ "type": "url",
+ "description": "The HIBP API URL to use for breaches lookup.",
+ "required": False,
+ "default": "https://haveibeenpwned.com/api/v3/breachedaccount/"
+ }
+ ]
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
email_obj = None
if isinstance(item, str):
- # If it's a string, treat it as a str
email_obj = Email(email=item)
elif isinstance(item, dict) and "email" in item:
- email_obj = Email(**item)
+ email_obj = Email(email=item["email"])
elif isinstance(item, Email):
email_obj = item
if email_obj:
@@ -87,139 +69,65 @@ class EmailToBreachesScanner(Scanner):
return cleaned
async def scan(self, data: InputType) -> OutputType:
- """Fetch breaches from email using HaveIBeenPwned API."""
results: OutputType = []
- if not HIBP_API_KEY:
- raise ValueError("HIBP_API_KEY not set for this account. Usr the Vault to set your haveibeenpwned key. ")
+ api_key = self.resolve_params()["HIBP_API_KEY"]
+ api_url = self.resolve_params()["HIBP_API_URL"]
+ if not api_key:
+ Logger.error(self.sketch_id, {"message": "A valid HIBP_API_KEY is required to scan for breaches."})
+ if not api_url:
+ Logger.error(self.sketch_id, {"message": "Could not find HIBP_API_URL in params."})
+ headers = {
+ "hibp-api-key": api_key,
+ "User-Agent": "FlowsInt-Scanner"
+ }
+ Logger.info(self.sketch_id, {"message": f"HIBP API key: {api_key}"})
+ Logger.info(self.sketch_id, {"message": f"HIBP API URL: {api_url}"})
for email in data:
try:
- url = f"https://haveibeenpwned.com/api/v3/breachedaccount/{email.email}?truncateResponse=false"
- headers = {"hibp-api-key": HIBP_API_KEY} if HIBP_API_KEY else {}
- response = requests.get(url, headers=headers)
+ # Query Have I Been Pwned API
+ full_url = urljoin(api_url, email.email)
+ response = requests.get(full_url, headers=headers, timeout=10)
+ Logger.info(self.sketch_id, {"message": f"HIBP API response: {response.json()}"})
if response.status_code == 200:
breaches_data = response.json()
- # Create a Breach object for each breach in the response
- for breach_item in breaches_data:
- # Lowercase all keys for the model
- breach_item_lc = {k.lower(): v for k, v in breach_item.items()}
- name_value = breach_item.get("Name")
- name = name_value.lower() if name_value else "unknown"
- # Lowercase the value of the 'name' key in the breach dict as well
- if "name" in breach_item_lc and breach_item_lc["name"]:
- breach_item_lc["name"] = breach_item_lc["name"].lower()
+ for breach_data in breaches_data:
breach = Breach(
- name=name,
- **{k: breach_item_lc.get(k) for k in Breach.model_fields.keys() if k not in ("breach", "name")},
- breach=breach_item_lc
+ name=breach_data.get("Name", ""),
+ title=breach_data.get("Title", ""),
+ domain=breach_data.get("Domain", ""),
+ breach_date=breach_data.get("BreachDate", ""),
+ added_date=breach_data.get("AddedDate", ""),
+ modified_date=breach_data.get("ModifiedDate", ""),
+ pwn_count=breach_data.get("PwnCount", 0),
+ description=breach_data.get("Description", ""),
+ data_classes=breach_data.get("DataClasses", []),
+ is_verified=breach_data.get("IsVerified", False),
+ is_fabricated=breach_data.get("IsFabricated", False),
+ is_sensitive=breach_data.get("IsSensitive", False),
+ is_retired=breach_data.get("IsRetired", False),
+ is_spam_list=breach_data.get("IsSpamList", False),
+ logo_path=breach_data.get("LogoPath", "")
)
results.append(breach)
- else:
+
+ elif response.status_code == 404:
+ # No breaches found for this email
+ Logger.info(self.sketch_id, {"message": f"No breaches found for email {email.email}"})
continue
+
+ else:
+ Logger.error(self.sketch_id, {"message": f"HIBP API error for {email.email}: {response.status_code}"})
+ continue
+
except Exception as e:
- Logger.info(self.sketch_id, {"message": f"No breach found for email {email.email}: {e}"})
+ Logger.error(self.sketch_id, {"message": f"Error checking breaches for email {email.email}: {e}"})
continue
-
+
return results
- def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
- # Create a mapping of email to breaches
- email_to_breaches = {}
- for i, breach in enumerate(results):
- # Find the corresponding email (assuming one email can have multiple breaches)
- # We need to track which email this breach belongs to
- # For now, we'll use the first email if we have multiple breaches
- email_index = min(i, len(original_input) - 1) if original_input else None
- email = original_input[email_index] if email_index is not None else None
-
- if email:
- if email.email not in email_to_breaches:
- email_to_breaches[email.email] = []
- email_to_breaches[email.email].append(breach)
-
- # Create breach node with all properties
- query = """
- MERGE (b:breach {name: $name})
- SET b.sketch_id = $sketch_id,
- b.label = $name,
- b.type = "breach",
- b.title = $title,
- b.domain = $domain,
- b.breachdate = $breachdate,
- b.addeddate = $addeddate,
- b.modifieddate = $modifieddate,
- b.pwncount = $pwncount,
- b.description = $description,
- b.src = $logopath,
- b.dataclasses = $dataclasses,
- b.isverified = $isverified,
- b.isfabricated = $isfabricated,
- b.issensitive = $issensitive,
- b.isretired = $isretired,
- b.isspamlist = $isspamlist,
- b.ismalware = $ismalware,
- b.isstealerlog = $isstealerlog,
- b.issubscriptionfree = $issubscriptionfree
- """
-
- if self.neo4j_conn:
- params = {
- "name": breach.name,
- "sketch_id": self.sketch_id,
- "title": breach.title,
- "domain": breach.domain,
- "breachdate": breach.breachdate,
- "addeddate": breach.addeddate,
- "modifieddate": breach.modifieddate,
- "pwncount": breach.pwncount,
- "description": breach.description,
- "logopath": breach.logopath,
- "dataclasses": breach.dataclasses,
- "isverified": breach.isverified,
- "isfabricated": breach.isfabricated,
- "issensitive": breach.issensitive,
- "isretired": breach.isretired,
- "isspamlist": breach.isspamlist,
- "ismalware": breach.ismalware,
- "isstealerlog": breach.isstealerlog,
- "issubscriptionfree": breach.issubscriptionfree,
- }
- self.neo4j_conn.query(query, params)
-
- # Create email nodes and relationships
- for email_email, breaches in email_to_breaches.items():
- email_query = """
- MERGE (e:email {email: $email})
- SET e.sketch_id = $sketch_id,
- e.label = $email,
- e.type = "email"
- """
-
- if self.neo4j_conn:
- email_params = {
- "email": email_email,
- "sketch_id": self.sketch_id,
- }
- self.neo4j_conn.query(email_query, email_params)
-
- # Create relationships for each breach
- for breach in breaches:
- rel_query = """
- MATCH (e:email {email: $email})
- MATCH (b:breach {name: $breach_name})
- MERGE (e)-[:HAS_BREACH {sketch_id: $sketch_id}]->(b)
- """
-
- if self.neo4j_conn:
- rel_params = {
- "email": email_email,
- "breach_name": breach.name,
- "sketch_id": self.sketch_id,
- }
- self.neo4j_conn.query(rel_query, rel_params)
-
- payload: Dict = {
- "message": f"{email_email} -> {breach.name}"
- }
- Logger.graph_append(self.sketch_id, payload)
-
- return results
\ No newline at end of file
+ def postprocess(self, results: OutputType, input_data: InputType = None) -> OutputType:
+ return results
+
+# Make types available at module level for easy access
+InputType = EmailToBreachesScanner.InputType
+OutputType = EmailToBreachesScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/individuals/to_org.py b/flowsint-api/app/scanners/individuals/to_org.py
index a8e8e05..12938af 100644
--- a/flowsint-api/app/scanners/individuals/to_org.py
+++ b/flowsint-api/app/scanners/individuals/to_org.py
@@ -1,20 +1,18 @@
import requests
-from typing import List, Dict, Any, TypeAlias, Union
-from pydantic import TypeAdapter
+from typing import List, Dict, Any, Union
from app.scanners.base import Scanner
from app.types.organization import Organization
from app.types.individual import Individual
-from app.utils import resolve_type
from app.core.logger import Logger
from app.tools.organizations.sirene import SireneTool
-
-InputType: TypeAlias = List[Individual]
-OutputType: TypeAlias = List[Organization]
-
class IndividualToOrgScanner(Scanner):
"""Find organization from a person with data from SIRENE (France only)."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Individual]
+ OutputType = List[Organization]
+
@classmethod
def name(cls) -> str:
return "to_org"
@@ -27,45 +25,6 @@ class IndividualToOrgScanner(Scanner):
def key(cls) -> str:
return "fullname"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- # Find the Organization type in $defs
- organization_def = schema["$defs"].get("Individual")
- if not organization_def:
- raise ValueError("Individual type not found in schema")
- return {
- "type": "Individual",
- "properties": [
- {"name": "fullname", "type": "string"}
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # the items property contains the Organization type reference
- items_schema = schema.get("items", {})
- if "$ref" in items_schema:
- # Extract the type name from the $ref (e.g., "#/$defs/Organization" -> "Organization")
- ref_path = items_schema["$ref"]
- type_name = ref_path.split("/")[-1]
- organization_def = schema["$defs"].get(type_name)
- if not organization_def:
- raise ValueError(f"Type {type_name} not found in schema")
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in organization_def["properties"].items()
- ]
- }
- else:
- raise ValueError("Expected $ref in items schema for List type")
-
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
if not isinstance(data, list):
raise ValueError(f"Expected list input, got {type(data).__name__}")
@@ -88,17 +47,17 @@ class IndividualToOrgScanner(Scanner):
async def scan(self, data: InputType) -> OutputType:
results: OutputType = []
- for indiv in data:
+ for individual in data:
try:
sirene = SireneTool()
- raw_orgs = sirene.launch(f'{indiv.first_name}+{indiv.last_name}', limit=25)
+ raw_orgs = sirene.launch(individual.full_name, limit=25)
if len(raw_orgs)> 0:
for org_dict in raw_orgs:
enriched_org = self.enrich_org(org_dict)
if enriched_org is not None:
results.append(enriched_org)
except Exception as e:
- continue
+ Logger.error(self.sketch_id, {"message": f"Error finding organization for {individual.full_name}: {e}"})
return results
def enrich_org(self, company: Dict) -> Organization:
@@ -121,7 +80,6 @@ class IndividualToOrgScanner(Scanner):
# Extract dirigeants and convert to Individual objects
dirigeants = []
for dirigeant_data in company.get("dirigeants", []):
- from app.types.individual import Individual
dirigeant = Individual(
first_name=dirigeant_data.get("prenoms", ""),
last_name=dirigeant_data.get("nom", ""),
@@ -272,26 +230,10 @@ class IndividualToOrgScanner(Scanner):
self.neo4j_conn.query("""
MERGE (o:Organization {name: $name, country: $country})
SET o.siren = $siren,
+ o.siege_siret = $siret,
o.nom_complet = $nom_complet,
o.nom_raison_sociale = $nom_raison_sociale,
o.sigle = $sigle,
- o.nombre_etablissements = $nombre_etablissements,
- o.nombre_etablissements_ouverts = $nombre_etablissements_ouverts,
- o.activite_principale = $activite_principale,
- o.section_activite_principale = $section_activite_principale,
- o.categorie_entreprise = $categorie_entreprise,
- o.annee_categorie_entreprise = $annee_categorie_entreprise,
- o.caractere_employeur = $caractere_employeur,
- o.tranche_effectif_salarie = $tranche_effectif_salarie,
- o.annee_tranche_effectif_salarie = $annee_tranche_effectif_salarie,
- o.date_creation = $date_creation,
- o.date_fermeture = $date_fermeture,
- o.date_mise_a_jour = $date_mise_a_jour,
- o.date_mise_a_jour_insee = $date_mise_a_jour_insee,
- o.date_mise_a_jour_rne = $date_mise_a_jour_rne,
- o.nature_juridique = $nature_juridique,
- o.etat_administratif = $etat_administratif,
- o.statut_diffusion = $statut_diffusion,
o.sketch_id = $sketch_id,
o.label = $name,
o.caption = $name,
@@ -300,133 +242,19 @@ class IndividualToOrgScanner(Scanner):
"name": org.name,
"country": "FR",
"siren": org.siren,
+ "siret": org.siege_siret,
"nom_complet": org.nom_complet,
"nom_raison_sociale": org.nom_raison_sociale,
"sigle": org.sigle,
- "nombre_etablissements": org.nombre_etablissements,
- "nombre_etablissements_ouverts": org.nombre_etablissements_ouverts,
- "activite_principale": org.activite_principale,
- "section_activite_principale": org.section_activite_principale,
- "categorie_entreprise": org.categorie_entreprise,
- "annee_categorie_entreprise": org.annee_categorie_entreprise,
- "caractere_employeur": org.caractere_employeur,
- "tranche_effectif_salarie": org.tranche_effectif_salarie,
- "annee_tranche_effectif_salarie": org.annee_tranche_effectif_salarie,
- "date_creation": org.date_creation,
- "date_fermeture": org.date_fermeture,
- "date_mise_a_jour": org.date_mise_a_jour,
- "date_mise_a_jour_insee": org.date_mise_a_jour_insee,
- "date_mise_a_jour_rne": org.date_mise_a_jour_rne,
- "nature_juridique": org.nature_juridique,
- "etat_administratif": org.etat_administratif,
- "statut_diffusion": org.statut_diffusion,
"sketch_id": self.sketch_id,
})
- # Add SIREN as identifier if available
- if org.siren:
- Logger.graph_append(self.sketch_id, {"message": f"{org.name}: SIREN {org.siren} -> {org.name}"})
-
- # Add SIRET as identifier if available
- if org.siege_siret:
- Logger.graph_append(self.sketch_id, {"message": f"{org.name}: SIRET {org.siege_siret} -> {org.name}"})
-
- # Add dirigeants (leaders) as Individual nodes with relationships
- if org.dirigeants:
- for dirigeant in org.dirigeants:
- self.neo4j_conn.query("""
- MERGE (i:Individual {full_name: $full_name})
- SET i.first_name = $first_name,
- i.last_name = $last_name,
- i.birth_date = $birth_date,
- i.gender = $gender,
- i.sketch_id = $sketch_id,
- i.label = $full_name,
- i.caption = $full_name,
- i.type = 'individual'
- WITH i
- MATCH (o:Organization {name: $org_name, country: $org_country})
- MERGE (o)-[:HAS_LEADER {sketch_id: $sketch_id}]->(i)
- """, {
- "full_name": dirigeant.full_name,
- "first_name": dirigeant.first_name,
- "last_name": dirigeant.last_name,
- "birth_date": dirigeant.birth_date,
- "gender": dirigeant.gender,
- "sketch_id": self.sketch_id,
- "org_name": org.name,
- "org_country": "FR",
- })
- Logger.graph_append(self.sketch_id, {"message": f"{org.name}: HAS_LEADER -> {dirigeant.full_name}"})
-
- # Add siege address as PhysicalAddress node if available
- if org.siege_geo_adresse:
- address = org.siege_geo_adresse
- self.neo4j_conn.query("""
- MERGE (a:PhysicalAddress {address: $address, city: $city, country: $country})
- SET a.zip = $zip,
- a.latitude = $latitude,
- a.longitude = $longitude,
- a.sketch_id = $sketch_id,
- a.label = $label,
- a.caption = $caption,
- a.type = 'location'
- WITH a
- MATCH (o:Organization {name: $org_name, country: $org_country})
- MERGE (o)-[:HAS_ADDRESS {sketch_id: $sketch_id}]->(a)
- """, {
- "address": address.address,
- "city": address.city,
- "country": address.country,
- "zip": address.zip,
- "latitude": address.latitude,
- "longitude": address.longitude,
- "sketch_id": self.sketch_id,
- "label": f"{address.address}, {address.city}",
- "caption": f"{address.address}, {address.city}",
- "org_name": org.name,
- "org_country": "FR",
- })
- Logger.graph_append(self.sketch_id, {"message": f"{org.name}: HAS_ADDRESS -> {address.address}, {address.city}"})
-
- # Add siege location as Location node if coordinates are available but no PhysicalAddress
- elif org.siege_latitude and org.siege_longitude:
- self.neo4j_conn.query("""
- MERGE (l:Location {latitude: $latitude, longitude: $longitude})
- SET l.address = $address,
- l.city = $city,
- l.country = $country,
- l.zip = $zip,
- l.sketch_id = $sketch_id,
- l.label = $label,
- l.caption = $caption,
- l.type = 'location'
- WITH l
- MATCH (o:Organization {name: $org_name, country: $org_country})
- MERGE (o)-[:LOCATED_AT {sketch_id: $sketch_id}]->(l)
- """, {
- "latitude": float(org.siege_latitude),
- "longitude": float(org.siege_longitude),
- "address": org.siege_adresse,
- "city": org.siege_libelle_commune,
- "country": "FR",
- "zip": org.siege_code_postal,
- "sketch_id": self.sketch_id,
- "label": f"{org.siege_adresse or 'Unknown'}, {org.siege_libelle_commune or 'Unknown'}",
- "caption": f"{org.siege_adresse or 'Unknown'}, {org.siege_libelle_commune or 'Unknown'}",
- "org_name": org.name,
- "org_country": "FR",
- })
- Logger.graph_append(self.sketch_id, {"message": f"{org.name}: LOCATED_AT -> {org.siege_libelle_commune or 'Unknown'}"})
-
- # Add activity codes as Activity nodes
- if org.activite_principale:
- Logger.graph_append(self.sketch_id, {"message": f"{org.name}: HAS_ACTIVITY -> {org.activite_principale}"})
-
- # Add legal nature as LegalNature node
- if org.nature_juridique:
- Logger.graph_append(self.sketch_id, {"message": f"{org.name}: HAS_LEGAL_NATURE -> {org.nature_juridique}"})
+ Logger.graph_append(self.sketch_id, {"message": f"Found organization: {org.name}"})
return results
+# Make types available at module level for easy access
+InputType = IndividualToOrgScanner.InputType
+OutputType = IndividualToOrgScanner.OutputType
+
diff --git a/flowsint-api/app/scanners/ips/asn_to_cidrs.py b/flowsint-api/app/scanners/ips/asn_to_cidrs.py
index 967921e..33cf2d2 100644
--- a/flowsint-api/app/scanners/ips/asn_to_cidrs.py
+++ b/flowsint-api/app/scanners/ips/asn_to_cidrs.py
@@ -1,21 +1,21 @@
import json
import socket
import subprocess
-from typing import List, Dict, Any, TypeAlias, Union
-from pydantic import TypeAdapter
+from typing import List, Dict, Any, Union
from app.scanners.base import Scanner
from app.types.cidr import CIDR
from app.types.ip import Ip
from app.types.asn import ASN
-from app.utils import is_valid_asn, parse_asn, resolve_type
+from app.utils import is_valid_asn, parse_asn
from app.core.logger import Logger
-InputType: TypeAlias = List[ASN]
-OutputType: TypeAlias = List[CIDR]
-
class AsnToCidrsScanner(Scanner):
"""Takes an ASN and returns its corresponding CIDRs."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[ASN]
+ OutputType = List[CIDR]
+
@classmethod
def name(cls) -> str:
return "asn_to_cidrs_scanner"
@@ -24,32 +24,6 @@ class AsnToCidrsScanner(Scanner):
def category(cls) -> str:
return "Asn"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[int], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
@@ -205,4 +179,8 @@ class AsnToCidrsScanner(Scanner):
"cidr_network": str(cidr.network),
"sketch_id": self.sketch_id,
})
- return results
\ No newline at end of file
+ return results
+
+# Make types available at module level for easy access
+InputType = AsnToCidrsScanner.InputType
+OutputType = AsnToCidrsScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/ips/cidr_to_ips.py b/flowsint-api/app/scanners/ips/cidr_to_ips.py
index a0a1bb2..d77a4ec 100644
--- a/flowsint-api/app/scanners/ips/cidr_to_ips.py
+++ b/flowsint-api/app/scanners/ips/cidr_to_ips.py
@@ -1,18 +1,17 @@
import subprocess
-from typing import List, Dict, Any, TypeAlias, Union
-from pydantic import TypeAdapter
+from typing import List, Dict, Any, Union
from app.scanners.base import Scanner
from app.types.cidr import CIDR
from app.types.ip import Ip
-from app.utils import resolve_type
from app.core.logger import Logger
-InputType: TypeAlias = List[CIDR]
-OutputType: TypeAlias = List[Ip]
-
class CidrToIpsScanner(Scanner):
"""Takes a CIDR and returns its corresponding IP addresses."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[CIDR]
+ OutputType = List[Ip]
+
@classmethod
def name(cls) -> str:
return "cidr_to_ips_scanner"
@@ -21,32 +20,6 @@ class CidrToIpsScanner(Scanner):
def category(cls) -> str:
return "Cidr"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
@@ -128,4 +101,8 @@ class CidrToIpsScanner(Scanner):
"ip_address": ip.address,
"sketch_id": self.sketch_id,
})
- return results
\ No newline at end of file
+ return results
+
+# Make types available at module level for easy access
+InputType = CidrToIpsScanner.InputType
+OutputType = CidrToIpsScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/ips/geolocation.py b/flowsint-api/app/scanners/ips/geolocation.py
index 50bfb1d..de335e5 100644
--- a/flowsint-api/app/scanners/ips/geolocation.py
+++ b/flowsint-api/app/scanners/ips/geolocation.py
@@ -12,6 +12,11 @@ OutputType: TypeAlias = List[Ip]
class GeolocationScanner(Scanner):
"""Get geolocation data for IP addresses."""
+
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Ip]
+ OutputType = List[Ip]
+
@classmethod
def name(cls) -> str:
return "ip_geolocation_scanner"
@@ -128,3 +133,6 @@ class GeolocationScanner(Scanner):
except Exception as e:
print(f"Failed to geolocate {address}: {e}")
return {}
+
+InputType = GeolocationScanner.InputType
+OutputType = GeolocationScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/ips/ip_to_asn.py b/flowsint-api/app/scanners/ips/ip_to_asn.py
index f0d88aa..32376a7 100644
--- a/flowsint-api/app/scanners/ips/ip_to_asn.py
+++ b/flowsint-api/app/scanners/ips/ip_to_asn.py
@@ -1,7 +1,7 @@
import json
import socket
import subprocess
-from typing import List, Dict, Any, TypeAlias, Union
+from typing import List, Dict, Any, Union
from pydantic import TypeAdapter
from app.scanners.base import Scanner
from app.types.cidr import CIDR
@@ -10,11 +10,12 @@ from app.types.asn import ASN
from app.utils import is_valid_ip, resolve_type
from app.core.logger import Logger
-InputType: TypeAlias = List[Ip]
-OutputType: TypeAlias = List[ASN]
-
class IpToAsnScanner(Scanner):
- """Takes an IP addreses and returns its corresponding ASN."""
+ """Takes an IP address and returns its corresponding ASN."""
+
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Ip]
+ OutputType = List[ASN]
@classmethod
def name(cls) -> str:
@@ -28,144 +29,57 @@ class IpToAsnScanner(Scanner):
def key(cls) -> str:
return "address"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
ip_obj = None
if isinstance(item, str):
- ip_obj = Ip(address=item)
+ if is_valid_ip(item):
+ ip_obj = Ip(address=item)
elif isinstance(item, dict) and "address" in item:
- ip_obj = Ip(address=item["address"])
+ if is_valid_ip(item["address"]):
+ ip_obj = Ip(address=item["address"])
elif isinstance(item, Ip):
ip_obj = item
- if ip_obj and is_valid_ip(ip_obj.address):
+ if ip_obj:
cleaned.append(ip_obj)
return cleaned
async def scan(self, data: InputType) -> OutputType:
- """Find ASN information for IP addresses using asnmap."""
- asns: OutputType = []
-
+ results: OutputType = []
+
for ip in data:
- asn_data = self.__get_asn_from_asnmap(ip.address)
- if asn_data:
- Logger.info(self.sketch_id, {"message": f"IP {ip.address} has ASN {asn_data['as_number']}."})
- asns.append(ASN(
- number=int(asn_data["as_number"].lstrip("AS")),
- name=asn_data["as_name"],
- country=asn_data["as_country"],
- cidrs=[CIDR(network=cidr) for cidr in asn_data["as_range"]]
- ))
- else:
- Logger.info(self.sketch_id, {"message": f"No ASN found for IP {ip.address}"})
- return asns
-
- def __get_asn_from_asnmap(self, ip: str) -> Dict[str, Any]:
- try:
- # Properly run the shell pipeline using shell=True
- command = f"echo {ip} | asnmap -silent -json | jq -s '.'"
- result = subprocess.run(
- command,
- shell=True,
- capture_output=True, text=True, timeout=60
- )
- if not result.stdout.strip():
- Logger.info(self.sketch_id, {"message": f"No ASN found for {ip}."})
- return None
try:
- # Parse the JSON array
- data_array = json.loads(result.stdout)
- if not data_array:
- return None
-
- combined_data = {
- "as_number": None,
- "as_name": None,
- "as_country": None,
- "as_range": []
- }
-
- for data in data_array:
- if data.get("as_number") and not combined_data["as_number"]:
- combined_data["as_number"] = data["as_number"]
- if data.get("as_name") and not combined_data["as_name"]:
- combined_data["as_name"] = data["as_name"]
- if data.get("as_country") and not combined_data["as_country"]:
- combined_data["as_country"] = data["as_country"]
- if "as_range" in data:
- combined_data["as_range"].extend(data["as_range"])
-
- return combined_data if combined_data["as_number"] else None
-
- except json.JSONDecodeError:
- Logger.error(self.sketch_id, {"message": f"Failed to parse JSON from asnmap output: {result.stdout}"})
- return None
-
- except Exception as e:
- Logger.error(self.sketch_id, {"message": f"asnmap exception for {ip}: {str(e)}"})
- return None
-
- def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
- # Create Neo4j relationships between IP addresses and their corresponding ASNs
- for input_ip, result_asn in zip(original_input, results):
- Logger.graph_append(self.sketch_id, {"message": f"IP {input_ip.address} -> ASN {result_asn.number}"})
- # Skip if no valid ASN was found
- if result_asn.number == 0:
+ # Use asnmap to get ASN info
+ result = subprocess.run(
+ ["asnmap", "-a", ip.address, "-json"],
+ capture_output=True,
+ text=True,
+ timeout=30
+ )
+
+ if result.returncode == 0:
+ output = result.stdout.strip()
+ if output:
+ asn_data = json.loads(output)
+ if asn_data and 'as_number' in asn_data:
+ asn = ASN(
+ asn=str(asn_data['as_number']),
+ name=asn_data.get('as_name', ''),
+ org=asn_data.get('as_org', ''),
+ country=asn_data.get('as_country', '')
+ )
+ results.append(asn)
+
+ except Exception as e:
+ Logger.error(self.sketch_id, {"message": f"Error getting ASN for IP {ip.address}: {e}"})
continue
- query = """
- MERGE (ip:ip {address: $ip_address})
- SET ip.sketch_id = $sketch_id,
- ip.label = $ip_address,
- ip.caption = $ip_address,
- ip.type = "ip"
-
- MERGE (asn:asn {number: $asn_number})
- SET asn.sketch_id = $sketch_id,
- asn.name = $asn_name,
- asn.country = $asn_country,
- asn.label = $asn_label,
- asn.caption = $asn_caption,
- asn.type = "asn"
-
- MERGE (ip)-[:BELONGS_TO {sketch_id: $sketch_id}]->(asn)
- """
-
- if self.neo4j_conn:
- self.neo4j_conn.query(query, {
- "ip_address": input_ip.address,
- "asn_number": result_asn.number,
- "asn_name": result_asn.name,
- "asn_country": result_asn.country,
- "asn_label": f"AS{result_asn.number}",
- "asn_caption": f"AS{result_asn.number} - {result_asn.name}",
- "sketch_id": self.sketch_id,
- })
+ return results
- return results
\ No newline at end of file
+ def postprocess(self, results: OutputType, input_data: InputType = None) -> OutputType:
+ return results
+
+# Make types available at module level for easy access
+InputType = IpToAsnScanner.InputType
+OutputType = IpToAsnScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/ips/reverse_resolve.py b/flowsint-api/app/scanners/ips/reverse_resolve.py
index df5c503..3a9484e 100644
--- a/flowsint-api/app/scanners/ips/reverse_resolve.py
+++ b/flowsint-api/app/scanners/ips/reverse_resolve.py
@@ -3,7 +3,7 @@ import os
import socket
import dns.resolver
import requests
-from typing import List, Dict, Any, TypeAlias, Union
+from typing import List, Dict, Any, Union
from pydantic import TypeAdapter
from app.core.logger import Logger
from app.scanners.base import Scanner
@@ -11,14 +11,15 @@ from app.types.domain import Domain
from app.types.ip import Ip
from app.utils import resolve_type, is_valid_ip
-InputType: TypeAlias = List[Ip]
-OutputType: TypeAlias = List[Domain]
-
PTR_BLACKLIST = re.compile(r"^ip\d+\.ip-\d+-\d+-\d+-\d+\.")
class ReverseResolveScanner(Scanner):
"""Resolve IP addresses to domain names using PTR, Certificate Transparency and optional API calls."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Ip]
+ OutputType = List[Domain]
+
@classmethod
def name(cls) -> str:
return "ip_reverse_resolve_scanner"
@@ -31,141 +32,61 @@ class ReverseResolveScanner(Scanner):
def key(cls) -> str:
return "address"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
ip_obj = None
if isinstance(item, str):
- ip_obj = Ip(address=item)
+ if is_valid_ip(item):
+ ip_obj = Ip(address=item)
elif isinstance(item, dict) and "address" in item:
- ip_obj = Ip(address=item["address"])
+ if is_valid_ip(item["address"]):
+ ip_obj = Ip(address=item["address"])
elif isinstance(item, Ip):
ip_obj = item
- if ip_obj and is_valid_ip(ip_obj.address):
+ if ip_obj:
cleaned.append(ip_obj)
return cleaned
async def scan(self, data: InputType) -> OutputType:
results: OutputType = []
+
for ip in data:
try:
- domains = self.get_domains_from_ip(ip.address)
- for d in domains:
- results.append(Domain(domain=d))
+ # Try PTR lookup
+ try:
+ hostname = socket.gethostbyaddr(ip.address)[0]
+ if hostname and not PTR_BLACKLIST.match(hostname):
+ domain = Domain(domain=hostname)
+ results.append(domain)
+ continue
+ except socket.herror:
+ pass
+
+ # Try Certificate Transparency logs
+ try:
+ ct_url = f"https://crt.sh/?q={ip.address}&output=json"
+ response = requests.get(ct_url, timeout=10)
+ if response.status_code == 200:
+ ct_data = response.json()
+ for entry in ct_data[:5]: # Limit to first 5 results
+ name_value = entry.get("name_value", "")
+ if name_value and name_value != ip.address:
+ domain = Domain(domain=name_value)
+ results.append(domain)
+ break
+ except Exception:
+ pass
+
except Exception as e:
- print(f"Error resolving {ip.address}: {e}")
+ Logger.error(self.sketch_id, {"message": f"Error reverse resolving IP {ip.address}: {e}"})
+ continue
+
return results
- def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
- for ip_obj, domain_obj in zip(original_input, results):
- Logger.graph_append(self.sketch_id, {"message": f"Reverse resolved {ip_obj.address} -> {domain_obj.domain}"})
- query = """
- MERGE (ip:ip {address: $address})
- SET ip.sketch_id = $sketch_id,
- ip.label = $label,
- ip.caption = $caption,
- ip.type = $type
- MERGE (domain:domain {domain: $domain})
- SET domain.sketch_id = $sketch_id,
- domain.label = $domain,
- domain.caption = $domain,
- domain.type = $domain_type
- MERGE (ip)-[:REVERSE_RESOLVES_TO {sketch_id: $sketch_id}]->(domain)
- """
- if self.neo4j_conn:
- self.neo4j_conn.query(query, {
- "domain": domain_obj.domain,
- "address": ip_obj.address,
- "sketch_id": self.sketch_id,
- "label": ip_obj.address,
- "caption": ip_obj.address,
- "type": "ip",
- "domain_type":"domain"
- })
-
+ def postprocess(self, results: OutputType, input_data: InputType = None) -> OutputType:
return results
- @classmethod
- def get_domains_from_ip(cls, address: str) -> List[str]:
- """
- 1) Attempt PTR lookup and filter generic provider names.
- 2) Query crt.sh for certificates matching the IP SAN/CN.
- 3) (Optional) Query a Reverse-IP API if API key is set.
- Returns a unique, sorted list of candidate domains.
- """
- candidates: List[str] = []
-
- try:
- answers = dns.resolver.resolve_address(address)
- ptr = answers[0].to_text().rstrip('.')
- if not PTR_BLACKLIST.match(ptr):
- candidates.append(ptr)
- except Exception:
- pass
-
- # 2. Certificate Transparency via crt.sh
- try:
- url = f"https://crt.sh/?q=%25.{address}&output=json"
- resp = requests.get(url, timeout=10)
- resp.raise_for_status()
- entries = resp.json()
- for entry in entries:
- names = entry.get("name_value", "").split("\n")
- for name in names:
- # skip wildcards and pure IPs
- name = name.strip().lower()
- if name.startswith("*."):
- name = name[2:]
- if name and not re.match(r"^\d+\.\d+\.\d+\.\d+$", name):
- candidates.append(name)
- except Exception:
- pass
-
- # 3. Reverse-IP API (e.g., SecurityTrails)
- api_key = os.getenv("REVERSE_IP_API_KEY")
- if api_key:
- try:
- headers = {"APIKEY": api_key}
- # Example endpoint; replace with your provider's
- api_url = f"https://api.securitytrails.com/v1/ips/hostname/{address}"
- r = requests.get(api_url, headers=headers, timeout=10)
- r.raise_for_status()
- hosts = r.json().get("hostnames", [])
- candidates.extend(hosts)
- except Exception:
- pass
-
- # Deduplicate and clean
- unique = []
- for c in candidates:
- c = c.lower().rstrip('.')
- if c not in unique:
- unique.append(c)
-
- return unique
\ No newline at end of file
+# Make types available at module level for easy access
+InputType = ReverseResolveScanner.InputType
+OutputType = ReverseResolveScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/leaks/hibp.py b/flowsint-api/app/scanners/leaks/hibp.py
index e67dcd3..90428fd 100644
--- a/flowsint-api/app/scanners/leaks/hibp.py
+++ b/flowsint-api/app/scanners/leaks/hibp.py
@@ -1,8 +1,9 @@
import json
import uuid
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Union
import hibpwned
from app.scanners.base import Scanner
+from app.core.logger import Logger
import os
from dotenv import load_dotenv
@@ -14,30 +15,35 @@ HIBP_API_KEY = os.getenv("HIBP_API_KEY")
class HibpScanner(Scanner):
"""Queries HaveIBeenPwned for potential leaks."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[str] # Email addresses as strings
+ OutputType = List[Dict[str, Any]] # Breach results as dictionaries
+
@classmethod
- def name(self) -> str:
+ def name(cls) -> str:
return "hibp_scanner"
@classmethod
- def category(self) -> str:
+ def category(cls) -> str:
return "leaks"
@classmethod
- def key(self) -> str:
+ def key(cls) -> str:
return "email"
- @classmethod
- def input_schema(self) -> Dict[str, str]:
- return ["email", "number", "full_name", "username"]
-
- @classmethod
- def output_schema(self) -> Dict[str, str]:
- return ["email", "breaches", "adobe", "data", "pastes", "password", "hashes"]
+ def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
+ cleaned: InputType = []
+ for item in data:
+ if isinstance(item, str):
+ cleaned.append(item)
+ elif isinstance(item, dict) and "email" in item:
+ cleaned.append(item["email"])
+ return cleaned
- async def scan(self, emails: List[str]) -> List[Dict[str, Any]]:
+ async def scan(self, data: InputType) -> OutputType:
"""Performs a search on HaveIBeenPwned for a list of emails."""
- results = []
- for email in emails:
+ results: OutputType = []
+ for email in data:
try:
result = hibpwned.Pwned(email, "MyHIBPChecker", HIBP_API_KEY)
@@ -62,9 +68,54 @@ class HibpScanner(Scanner):
"email": email,
"error": f"Error during scan: {str(e)}",
})
+ Logger.error(self.sketch_id, {"message": f"Error scanning email {email}: {str(e)}"})
return results
- def postprocess(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Adds additional metadata to the results."""
- return {"output":results}
+ def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
+ """Create Neo4j relationships for found breaches."""
+ if not self.neo4j_conn:
+ return results
+
+ for result in results:
+ if "error" not in result:
+ email = result["email"]
+
+ # Create email node
+ email_query = """
+ MERGE (email:email {address: $address})
+ SET email.sketch_id = $sketch_id,
+ email.label = $address,
+ email.caption = $address,
+ email.type = "email"
+ """
+ self.neo4j_conn.query(email_query, {
+ "address": email,
+ "sketch_id": self.sketch_id
+ })
+
+ # Create breach relationships
+ for breach in result.get("breaches", []):
+ if breach and isinstance(breach, dict):
+ breach_name = breach.get("Name", "Unknown")
+ self.neo4j_conn.query("""
+ MERGE (breach:breach {name: $name})
+ SET breach.sketch_id = $sketch_id,
+ breach.label = $name,
+ breach.caption = $name,
+ breach.type = "breach"
+ WITH breach
+ MATCH (email:email {address: $email_address})
+ MERGE (email)-[:FOUND_IN_BREACH {sketch_id: $sketch_id}]->(breach)
+ """, {
+ "name": breach_name,
+ "email_address": email,
+ "sketch_id": self.sketch_id
+ })
+ Logger.graph_append(self.sketch_id, {"message": f"Email {email} found in breach: {breach_name}"})
+
+ return results
+
+# Make types available at module level for easy access
+InputType = HibpScanner.InputType
+OutputType = HibpScanner.OutputType
diff --git a/flowsint-api/app/scanners/n8n/connector.py b/flowsint-api/app/scanners/n8n/connector.py
index 289e8e0..6851204 100644
--- a/flowsint-api/app/scanners/n8n/connector.py
+++ b/flowsint-api/app/scanners/n8n/connector.py
@@ -1,17 +1,30 @@
import json
import aiohttp
-from typing import List, Dict, Any, Optional, TypeAlias
+from typing import List, Dict, Any, Optional
from app.scanners.base import Scanner
from app.core.logger import Logger
from app.core.graph_db import Neo4jConnection
-InputType: TypeAlias = List[dict]
-OutputType: TypeAlias = List[dict]
-
class N8nConnector(Scanner):
"""
- Let's you use your custom n8n workflows to process data. The types are not checked on this connector, so make sure to use the correct types in your n8n workflows.
+ Connect to your custom n8n workflows to process data through webhooks.
+
+ ## Setup instructions:
+ 1. In your n8n workflow, add a **Webhook** trigger node as the starting node
+ 2. In the Webhook node, set **Respond** to `"Using 'Respond to Webhook' node"`
+ 3. Add a **Respond to Webhook** node at the end of your workflow to return processed data
+ 4. Use the webhook URL from your n8n workflow in the `webhook_url` parameter
+
+ The connector will send your input data as JSON to the webhook and expect JSON response.
+ Types are not validated by this connector, so ensure your n8n workflow handles the expected data types correctly.
+
+ For more details on webhook responses, see: [Respond to Webhook documentation](https://docs.n8n.io/integrations/builtin/core-nodes/n8n-nodes-base.respondtowebhook/)
"""
+
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Any]
+ OutputType = List[Any]
+
def __init__(
self,
sketch_id: Optional[str] = None,
@@ -29,6 +42,10 @@ class N8nConnector(Scanner):
params=params
)
+ @classmethod
+ def icon(cls) -> str | None:
+ return "n8n"
+
@classmethod
def name(cls) -> str:
return "n8n_connector"
@@ -36,29 +53,15 @@ class N8nConnector(Scanner):
@classmethod
def category(cls) -> str:
return "external"
+
+ @classmethod
+ def required_params(cls) -> bool:
+ return True
@classmethod
def key(cls) -> str:
return "any"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- return {
- "type": "Any",
- "properties": [
- {"name": "value", "type": "object"}
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- return {
- "type": "Any",
- "properties": [
- {"name": "data", "type": "object"}
- ]
- }
-
@classmethod
def get_params_schema(cls) -> List[Dict[str, Any]]:
return [
@@ -82,7 +85,7 @@ class N8nConnector(Scanner):
}
]
- async def scan(self, values: list[str]) -> list[dict]:
+ async def scan(self, values: InputType) -> OutputType:
params = self.get_params()
url = params["webhook_url"]
Logger.info(self.sketch_id, {"message": f"n8n connector url: {url}"})
@@ -91,10 +94,11 @@ class N8nConnector(Scanner):
headers["Authorization"] = f"Bearer {params['auth_token']}"
payload = {
+ "sketch_id": self.sketch_id,
+ "type": values[0] if values else None,
"inputs": values
}
- # Ajout de données additionnelles dans le payload
if "extra_payload" in params and params["extra_payload"] is not None:
try:
extra = json.loads(params["extra_payload"])
@@ -102,14 +106,39 @@ class N8nConnector(Scanner):
except json.JSONDecodeError:
Logger.warn(self.sketch_id, {"message": "extra_payload is not valid JSON"})
- async with aiohttp.ClientSession() as session:
- async with session.post(url, headers=headers, json=payload) as response:
- if response.status != 200:
- raise Exception(f"n8n responded with {response.status}: {await response.text()}")
- data = await response.json()
+ Logger.info(self.sketch_id, {"message": f"Sending request to n8n webhook with payload: {json.dumps(payload)}"})
- return data
+ try:
+ async with aiohttp.ClientSession() as session:
+ async with session.post(url, headers=headers, json=payload) as response:
+ Logger.info(self.sketch_id, {"message": f"n8n webhook responded with status: {response.status}"})
+
+ # Log the raw response text for debugging
+ response_text = await response.text()
+ Logger.info(self.sketch_id, {"message": f"n8n webhook raw response: {response_text}"})
+
+ if response.status != 200:
+ Logger.warn(self.sketch_id, {"message": f"n8n responded with non-200 status: {response.status} - Response: {response_text}"})
+ raise Exception(f"n8n responded with {response.status}: {response_text}")
+
+ try:
+ data = json.loads(response_text)
+ Logger.info(self.sketch_id, {"message": f"n8n connector received response: {json.dumps(data)}"})
+ return data
+ except json.JSONDecodeError as e:
+ Logger.warn(self.sketch_id, {"message": f"Failed to parse n8n response as JSON: {str(e)} - Raw response: {response_text}"})
+ # Return the raw text wrapped in a list of dicts as expected
+ return [{"raw_response": response_text, "error": "Response was not valid JSON"}]
+
+ except Exception as e:
+ Logger.warn(self.sketch_id, {"message": f"Error calling n8n webhook: {str(e)}"})
+ # Re-raise the exception so the caller knows something went wrong
+ raise
- def postprocess(self, results: list[dict], original_input: list[dict]) -> list[dict]:
- Logger.success(self.sketch_id, {"message": "n8n connector results", "results": results})
+ def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
+ Logger.success(self.sketch_id, {"message": f"n8n connector results: {json.dumps(results)}"})
return results
+
+# Make types available at module level for easy access
+InputType = N8nConnector.InputType
+OutputType = N8nConnector.OutputType
diff --git a/flowsint-api/app/scanners/organizations/org_to_asn.py b/flowsint-api/app/scanners/organizations/org_to_asn.py
index c03e1b7..71701de 100644
--- a/flowsint-api/app/scanners/organizations/org_to_asn.py
+++ b/flowsint-api/app/scanners/organizations/org_to_asn.py
@@ -1,20 +1,19 @@
import json
import socket
import subprocess
-from typing import List, Dict, Any, TypeAlias, Union
-from pydantic import TypeAdapter
+from typing import List, Dict, Any, Union
from app.scanners.base import Scanner
from app.types.organization import Organization
from app.types.asn import ASN
-from app.utils import resolve_type
from app.core.logger import Logger
-InputType: TypeAlias = List[Organization]
-OutputType: TypeAlias = List[ASN]
-
class OrgToAsnScanner(Scanner):
"""Takes an organization and returns its corresponding ASN."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Organization]
+ OutputType = List[ASN]
+
@classmethod
def name(cls) -> str:
return "org_to_asn_scanner"
@@ -27,38 +26,6 @@ class OrgToAsnScanner(Scanner):
def key(cls) -> str:
return "name"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- # Find the Organization type in $defs
- organization_def = schema["$defs"].get("Organization")
- if not organization_def:
- raise ValueError("Organization type not found in schema")
- return {
- "type": "Organization",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in organization_def["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # Find the ASN type in $defs
- asn_def = schema["$defs"].get("ASN")
- if not asn_def:
- raise ValueError("ASN type not found in schema")
- return {
- "type": "ASN",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in asn_def["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
@@ -171,4 +138,8 @@ class OrgToAsnScanner(Scanner):
})
Logger.graph_append(self.sketch_id, {"message": f"Found for {input_org.name} -> ASN {result_asn.number}"})
- return results
\ No newline at end of file
+ return results
+
+# Make types available at module level for easy access
+InputType = OrgToAsnScanner.InputType
+OutputType = OrgToAsnScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/organizations/to_infos.py b/flowsint-api/app/scanners/organizations/to_infos.py
index 963790f..a4147bd 100644
--- a/flowsint-api/app/scanners/organizations/to_infos.py
+++ b/flowsint-api/app/scanners/organizations/to_infos.py
@@ -1,18 +1,17 @@
-from typing import List, Dict, Any, TypeAlias, Union
-from pydantic import TypeAdapter
+from typing import List, Dict, Any, Union
from app.scanners.base import Scanner
from app.types.organization import Organization
-from app.utils import resolve_type
from app.core.logger import Logger
from app.tools.organizations.sirene import SireneTool
-InputType: TypeAlias = List[Organization]
-OutputType: TypeAlias = List[Organization]
-
class OrgToInfosScanner(Scanner):
"""Enrich Organization with data from SIRENE (France only)."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Organization]
+ OutputType = List[Organization]
+
@classmethod
def name(cls) -> str:
return "to_infos"
@@ -25,51 +24,12 @@ class OrgToInfosScanner(Scanner):
def key(cls) -> str:
return "name"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- # Find the Organization type in $defs
- organization_def = schema["$defs"].get("Organization")
- if not organization_def:
- raise ValueError("Organization type not found in schema")
- return {
- "type": "Organization",
- "properties": [
- {"name": "name", "type": "string"}
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # the items property contains the Organization type reference
- items_schema = schema.get("items", {})
- if "$ref" in items_schema:
- # Extract the type name from the $ref (e.g., "#/$defs/Organization" -> "Organization")
- ref_path = items_schema["$ref"]
- type_name = ref_path.split("/")[-1]
- organization_def = schema["$defs"].get(type_name)
- if not organization_def:
- raise ValueError(f"Type {type_name} not found in schema")
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in organization_def["properties"].items()
- ]
- }
- else:
- raise ValueError("Expected $ref in items schema for List type")
-
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
if not isinstance(data, list):
raise ValueError(f"Expected list input, got {type(data).__name__}")
cleaned: InputType = []
for item in data:
- if isinstance(item, str) and str!="":
+ if isinstance(item, str) and item != "":
cleaned.append(Organization(name=item))
elif isinstance(item, dict) and "name" in item and item["name"] != "":
cleaned.append(Organization(**item))
@@ -90,7 +50,7 @@ class OrgToInfosScanner(Scanner):
if enriched_org is not None:
results.append(enriched_org)
except Exception as e:
- print(f"Error enriching organization {org.name}: {e}")
+ Logger.error(self.sketch_id, {"message": f"Error enriching organization {org.name}: {e}"})
return results
def enrich_org(self, company: Dict) -> Organization:
@@ -422,4 +382,8 @@ class OrgToInfosScanner(Scanner):
return results
+# Make types available at module level for easy access
+InputType = OrgToInfosScanner.InputType
+OutputType = OrgToInfosScanner.OutputType
+
diff --git a/flowsint-api/app/scanners/phones/ignorant.py b/flowsint-api/app/scanners/phones/ignorant.py
index af44eab..c08811e 100644
--- a/flowsint-api/app/scanners/phones/ignorant.py
+++ b/flowsint-api/app/scanners/phones/ignorant.py
@@ -1,36 +1,43 @@
import asyncio
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Union
from app.scanners.base import Scanner
from app.utils import is_valid_number
+from app.core.logger import Logger
import httpx
-class IgnorantScanner(Scanner):
+class IgnorantScanner(Scanner):
+
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[str] # Phone numbers as strings
+ OutputType = List[Dict[str, Any]] # Results as dictionaries
+
@classmethod
- def name(self) -> str:
+ def name(cls) -> str:
return "ignorant_scanner"
@classmethod
- def category(self) -> str:
+ def category(cls) -> str:
return "phones"
@classmethod
- def key(self) -> str:
+ def key(cls) -> str:
return "number"
-
- @classmethod
- def input_schema(self) -> Dict[str, str]:
- return ["number"]
-
- @classmethod
- def output_schema(self) -> Dict[str, str]:
- return ["exists"]
- async def scan(self, phones: List[str]) -> List[Dict[str, Any]]:
+ def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
+ cleaned: InputType = []
+ for item in data:
+ if isinstance(item, str):
+ cleaned.append(item)
+ elif isinstance(item, dict) and "number" in item:
+ cleaned.append(item["number"])
+ return cleaned
+
+ async def scan(self, data: InputType) -> OutputType:
"""
Performs the Ignorant search for each specified phone number.
"""
- results = []
- for phone in phones:
+ results: OutputType = []
+ for phone in data:
try:
cleaned_phone = is_valid_number(phone)
if cleaned_phone:
@@ -46,6 +53,7 @@ class IgnorantScanner(Scanner):
"number": phone,
"error": f"Unexpected error in Ignorant scan: {str(e)}"
})
+ Logger.error(self.sketch_id, {"message": f"Error scanning phone {phone}: {str(e)}"})
return results
async def _perform_ignorant_research(self, phone: str) -> Dict[str, Any]:
@@ -69,7 +77,10 @@ class IgnorantScanner(Scanner):
if response:
results.append(response)
- return results
+ return {
+ "number": phone,
+ "platforms": results
+ }
except Exception as e:
return {
@@ -77,8 +88,36 @@ class IgnorantScanner(Scanner):
"error": f"Error in Ignorant research: {str(e)}"
}
- def postprocess(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
"""
- Adds additional metadata to the results.
+ Create Neo4j relationships for found phone accounts.
"""
- return { "output": {"phones": results } }
+ if not self.neo4j_conn:
+ return results
+
+ for result in results:
+ if "error" not in result and "platforms" in result:
+ # Create phone number node
+ phone_query = """
+ MERGE (phone:phone {number: $number})
+ SET phone.sketch_id = $sketch_id,
+ phone.label = $number,
+ phone.caption = $number,
+ phone.type = "phone"
+ """
+ self.neo4j_conn.query(phone_query, {
+ "number": result["number"],
+ "sketch_id": self.sketch_id
+ })
+
+ # Create platform relationships
+ for platform_result in result["platforms"]:
+ if platform_result and isinstance(platform_result, dict):
+ platform_name = platform_result.get("platform", "unknown")
+ Logger.graph_append(self.sketch_id, {"message": f"Phone {result['number']} found on {platform_name}"})
+
+ return results
+
+# Make types available at module level for easy access
+InputType = IgnorantScanner.InputType
+OutputType = IgnorantScanner.OutputType
diff --git a/flowsint-api/app/scanners/registry.py b/flowsint-api/app/scanners/registry.py
index 9cc216e..2d9ff0e 100644
--- a/flowsint-api/app/scanners/registry.py
+++ b/flowsint-api/app/scanners/registry.py
@@ -6,7 +6,6 @@ from app.scanners.domains.resolve import ResolveScanner
from app.scanners.ips.reverse_resolve import ReverseResolveScanner
from app.scanners.ips.geolocation import GeolocationScanner
from app.scanners.socials.maigret import MaigretScanner
-from app.scanners.emails.holehe import HoleheScanner
from app.scanners.ips.ip_to_asn import IpToAsnScanner
from app.scanners.ips.asn_to_cidrs import AsnToCidrsScanner
from app.scanners.ips.cidr_to_ips import CidrToIpsScanner
@@ -57,7 +56,8 @@ class ScannerRegistry:
"outputs": scanner.output_schema(),
"params": {},
"params_schema": scanner.get_params_schema(),
- "requires_key": scanner.requires_key(),
+ "required_params": scanner.required_params(),
+ "icon": scanner.icon(),
}
for name, scanner in cls._scanners.items()
}
@@ -79,7 +79,8 @@ class ScannerRegistry:
"outputs": scanner.output_schema(),
"params": {},
"params_schema": scanner.get_params_schema(),
- "requires_key": scanner.requires_key(),
+ "required_params": scanner.required_params(),
+ "icon": scanner.icon(),
})
return scanners_by_category
@@ -96,7 +97,8 @@ class ScannerRegistry:
"outputs": scanner.output_schema(),
"params": {},
"params_schema": scanner.get_params_schema(),
- "requires_key": scanner.requires_key(),
+ "required_params": scanner.required_params(),
+ "icon": scanner.icon(),
} for _, scanner in cls._scanners.items()]
return [{
@@ -109,7 +111,8 @@ class ScannerRegistry:
"outputs": scanner.output_schema(),
"params": {},
"params_schema": scanner.get_params_schema(),
- "requires_key": scanner.requires_key(),
+ "required_params": scanner.required_params(),
+ "icon": scanner.icon(),
} for _, scanner in cls._scanners.items() if scanner.input_schema()["type"].lower() in ["any", input_type.lower()]]
ScannerRegistry.register(ReverseResolveScanner)
@@ -118,7 +121,6 @@ ScannerRegistry.register(SubdomainScanner)
ScannerRegistry.register(WhoisScanner)
ScannerRegistry.register(GeolocationScanner)
ScannerRegistry.register(MaigretScanner)
-ScannerRegistry.register(HoleheScanner)
ScannerRegistry.register(IpToAsnScanner)
ScannerRegistry.register(AsnToCidrsScanner)
ScannerRegistry.register(CidrToIpsScanner)
diff --git a/flowsint-api/app/scanners/socials/maigret.py b/flowsint-api/app/scanners/socials/maigret.py
index 47ca3f0..2944120 100644
--- a/flowsint-api/app/scanners/socials/maigret.py
+++ b/flowsint-api/app/scanners/socials/maigret.py
@@ -1,22 +1,22 @@
import json
import subprocess
from pathlib import Path
-from typing import List, Dict, Any, TypeAlias, Union
+from typing import List, Dict, Any, Union
-from app.utils import is_valid_username, resolve_type
+from app.utils import is_valid_username
from app.scanners.base import Scanner
from app.types.social import SocialProfile
-from pydantic import TypeAdapter
from app.core.logger import Logger
-InputType: TypeAlias = List[SocialProfile]
-OutputType: TypeAlias = List[SocialProfile]
-
false_positives = ["LeagueOfLegends"]
class MaigretScanner(Scanner):
"""Scans usernames for associated social accounts using Maigret."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[SocialProfile]
+ OutputType = List[SocialProfile]
+
@classmethod
def name(cls) -> str:
return "maigret_scanner"
@@ -29,32 +29,6 @@ class MaigretScanner(Scanner):
def key(cls) -> str:
return "username"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
@@ -80,7 +54,7 @@ class MaigretScanner(Scanner):
timeout=100
)
except Exception as e:
- print(f"[FAILED] Maigret execution failed for {username}: {e}")
+ Logger.error(self.sketch_id, {"message": f"Maigret execution failed for {username}: {e}"})
return output_file
def parse_maigret_output(self, username: str, output_file: Path) -> List[SocialProfile]:
@@ -92,7 +66,7 @@ class MaigretScanner(Scanner):
with open(output_file, "r") as f:
raw_data = json.load(f)
except Exception as e:
- print(f"[FAILED] Failed to load output file for {username}: {e}")
+ Logger.error(self.sketch_id, {"message": f"Failed to load output file for {username}: {e}"})
return results
for platform, profile in raw_data.items():
@@ -134,15 +108,14 @@ class MaigretScanner(Scanner):
async def scan(self, data: InputType) -> OutputType:
results: OutputType = []
- for ms in data:
- if not ms.username:
+ for profile in data:
+ if not profile.username:
continue
- output_file = self.run_maigret(ms.username)
- parsed = self.parse_maigret_output(ms.username, output_file)
+ output_file = self.run_maigret(profile.username)
+ parsed = self.parse_maigret_output(profile.username, output_file)
results.extend(parsed)
return results
-
def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
if not self.neo4j_conn:
return results
@@ -183,5 +156,8 @@ class MaigretScanner(Scanner):
"sketch_id": self.sketch_id
})
-
return results
+
+# Make types available at module level for easy access
+InputType = MaigretScanner.InputType
+OutputType = MaigretScanner.OutputType
diff --git a/flowsint-api/app/scanners/socials/sherlock.py b/flowsint-api/app/scanners/socials/sherlock.py
index 15c2205..e8c86e9 100644
--- a/flowsint-api/app/scanners/socials/sherlock.py
+++ b/flowsint-api/app/scanners/socials/sherlock.py
@@ -1,58 +1,30 @@
import subprocess
from pathlib import Path
-from typing import Dict, Any, List, TypeAlias, Union
+from typing import Dict, Any, List, Union
from app.utils import is_valid_username
-from app.types.social import Social, Social
+from app.types.social import Social
from app.scanners.base import Scanner
-from pydantic import TypeAdapter
-from app.utils import is_valid_username, resolve_type
-from app.core.logger import logger
-
-
-InputType: TypeAlias = List[Social]
-OutputType: TypeAlias = List[Social]
+from app.core.logger import Logger
class SherlockScanner(Scanner):
"""Scans the usernames for associated social accounts using Sherlock."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Social]
+ OutputType = List[Social]
+
@classmethod
- def name(self) -> str:
+ def name(cls) -> str:
return "sherlock_scanner"
@classmethod
- def category(self) -> str:
+ def category(cls) -> str:
return "social_account"
@classmethod
- def key(self) -> str:
+ def key(cls) -> str:
return "username"
-
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- type_name, details = list(schema["$defs"].items())[0]
- return {
- "type": type_name,
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in details["properties"].items()
- ]
- }
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
@@ -69,12 +41,13 @@ class SherlockScanner(Scanner):
cleaned.append(obj)
return cleaned
- async def scan(self, usernames: List[str]) -> Dict[str, Any]:
+ async def scan(self, data: InputType) -> OutputType:
"""Performs the scan using Sherlock on the list of usernames."""
- results_list = [] # List to store scan results for each username
+ results: OutputType = []
- for username in usernames:
- output_file = Path(f"/tmp/sherlock_{username}.txt") # Output file path
+ for social in data:
+ username = social.username
+ output_file = Path(f"/tmp/sherlock_{username}.txt")
try:
# Running the Sherlock command to perform the scan
result = subprocess.run(
@@ -85,15 +58,11 @@ class SherlockScanner(Scanner):
)
if result.returncode != 0:
- results_list.append({
- "error": f"Sherlock failed for {username}: {result.stderr.strip()}"
- })
+ Logger.error(self.sketch_id, {"message": f"Sherlock failed for {username}: {result.stderr.strip()}"})
continue
if not output_file.exists():
- results_list.append({
- "error": f"Sherlock did not produce any output file for {username}."
- })
+ Logger.error(self.sketch_id, {"message": f"Sherlock did not produce any output file for {username}."})
continue
found_accounts = {}
@@ -104,18 +73,46 @@ class SherlockScanner(Scanner):
platform = line.split("/")[2] # Example: twitter.com
found_accounts[platform] = line
- results_list.append({
- "username": username,
- "output": found_accounts
- })
+ # Create Social objects for each found account
+ for platform, url in found_accounts.items():
+ results.append(Social(
+ username=username,
+ platform=platform,
+ url=url
+ ))
except subprocess.TimeoutExpired:
- results_list.append({"error": f"Sherlock scan for {username} timed out."})
+ Logger.error(self.sketch_id, {"message": f"Sherlock scan for {username} timed out."})
except Exception as e:
- results_list.append({"error": f"Unexpected error in Sherlock scan for {username}: {str(e)}"})
+ Logger.error(self.sketch_id, {"message": f"Unexpected error in Sherlock scan for {username}: {str(e)}"})
- return results_list
+ return results
- def postprocess(self, results: Dict[str, Any]) -> Dict[str, Any]:
- """Adds additional metadata to the results."""
- return {"output": results}
+ def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
+ """Create Neo4j relationships for found social accounts."""
+ if not self.neo4j_conn:
+ return results
+
+ for social in results:
+ # Create or update social account node
+ social_query = """
+ MERGE (social:social {username: $username, platform: $platform})
+ SET social.url = $url,
+ social.sketch_id = $sketch_id,
+ social.label = $username,
+ social.caption = $platform,
+ social.type = "social"
+ """
+ self.neo4j_conn.query(social_query, {
+ "username": social.username,
+ "platform": social.platform,
+ "url": social.url,
+ "sketch_id": self.sketch_id
+ })
+ Logger.graph_append(self.sketch_id, {"message": f"Found social account: {social.username} on {social.platform}"})
+
+ return results
+
+# Make types available at module level for easy access
+InputType = SherlockScanner.InputType
+OutputType = SherlockScanner.OutputType
diff --git a/flowsint-api/app/scanners/websites/to_crawler.py b/flowsint-api/app/scanners/websites/to_crawler.py
index 3a929a3..edd6133 100644
--- a/flowsint-api/app/scanners/websites/to_crawler.py
+++ b/flowsint-api/app/scanners/websites/to_crawler.py
@@ -1,21 +1,19 @@
-from typing import List, Dict, Any, TypeAlias, Union, Set
+from typing import List, Dict, Any, Union, Set
from urllib.parse import urlparse
-from app.utils import resolve_type
from app.scanners.base import Scanner
from app.types.website import Website
from app.types.phone import Phone
from app.types.email import Email
-from pydantic import TypeAdapter
from app.core.logger import Logger
from app.tools.network.reconcrawl import ReconCrawlTool
-InputType: TypeAlias = List[Website]
-OutputType: TypeAlias = List[Dict[str, Union[Website, List[Phone], List[Email]]]]
-
-
class WebsiteToCrawler(Scanner):
"""From website to crawler."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Website]
+ OutputType = List[Dict[str, Any]] # Simplified output type
+
@classmethod
def name(cls) -> str:
return "to_crawler"
@@ -28,36 +26,6 @@ class WebsiteToCrawler(Scanner):
def key(cls) -> str:
return "url"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- # Find the Website type in $defs
- website_def = schema["$defs"].get("Website")
- if not website_def:
- raise ValueError("Website type not found in schema")
- return {
- "type": "Website",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in website_def["properties"].items()
- ]
- }
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # For complex output types, we need to create a custom schema
- return {
- "type": "WebsiteResult",
- "properties": [
- {"name": "website", "type": "Website"},
- {"name": "emails", "type": "Email[]"},
- {"name": "phones", "type": "Phone[]"},
- ]
- }
-
def is_same_domain(self, url: str, base_domain: str) -> bool:
"""Check if URL belongs to the same domain."""
try:
@@ -191,4 +159,7 @@ class WebsiteToCrawler(Scanner):
})
Logger.graph_append(self.sketch_id, {"message": f"Found phone {phone.number} for website {website_url}"})
- return results
\ No newline at end of file
+ return results
+
+InputType = WebsiteToCrawler.InputType
+OutputType = WebsiteToCrawler.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/websites/to_domain.py b/flowsint-api/app/scanners/websites/to_domain.py
index 4829ad8..2f80a07 100644
--- a/flowsint-api/app/scanners/websites/to_domain.py
+++ b/flowsint-api/app/scanners/websites/to_domain.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, TypeAlias, Union
+from typing import List, Dict, Any, Union
from urllib.parse import urlparse
from app.utils import resolve_type
from app.scanners.base import Scanner
@@ -7,13 +7,13 @@ from app.types.domain import Domain
from pydantic import TypeAdapter
from app.core.logger import Logger
-InputType: TypeAlias = List[Website]
-OutputType: TypeAlias = List[Domain]
-
-
class WebsiteToDomainScanner(Scanner):
"""From website to domain."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Website]
+ OutputType = List[Domain]
+
@classmethod
def name(cls) -> str:
return "to_domain"
@@ -26,117 +26,48 @@ class WebsiteToDomainScanner(Scanner):
def key(cls) -> str:
return "website"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- # Find the Website type in $defs
- website_def = schema["$defs"].get("Website")
- if not website_def:
- raise ValueError("Website type not found in schema")
- return {
- "type": "Website",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in website_def["properties"].items()
- ]
- }
-
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # Find the Domain type in $defs
- domain_def = schema["$defs"].get("Domain")
- if not domain_def:
- raise ValueError("Domain type not found in schema")
- return {
- "type": "Domain",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in domain_def["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
website_obj = None
if isinstance(item, str):
- # If it's a string, treat it as a URL
website_obj = Website(url=item)
elif isinstance(item, dict) and "url" in item:
- website_obj = Website(**item)
+ website_obj = Website(url=item["url"])
elif isinstance(item, Website):
website_obj = item
if website_obj:
cleaned.append(website_obj)
return cleaned
-
- def __extract_domain_from_url(self, url: str) -> str:
- """Extract domain from URL."""
- try:
- parsed = urlparse(str(url))
- domain = parsed.netloc
- # Remove port if present
- if ':' in domain:
- domain = domain.split(':')[0]
- return domain
- except Exception:
- return ""
async def scan(self, data: InputType) -> OutputType:
- """Extract domain from website."""
results: OutputType = []
for website in data:
try:
- # Extract domain from the website URL
- domain_name = self.__extract_domain_from_url(website.url)
+ parsed_url = urlparse(website.url)
+ domain_name = parsed_url.netloc
+
+ # Remove port if present
+ if ':' in domain_name:
+ domain_name = domain_name.split(':')[0]
+
+ # Remove www. prefix if present
+ if domain_name.startswith('www.'):
+ domain_name = domain_name[4:]
+
if domain_name:
- domain = Domain(domain=domain_name)
- results.append(domain)
+ domain_obj = Domain(domain=domain_name)
+ results.append(domain_obj)
+
except Exception as e:
- print(f"Error processing website {website.url}: {e}")
+ Logger.error(self.sketch_id, {"message": f"Error extracting domain from website {website.url}: {e}"})
continue
-
+
return results
- def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
- for i, domain in enumerate(results):
- website = original_input[i] if i < len(original_input) else None
-
- query = """
- MERGE (d:domain {domain: $domain})
- SET d.sketch_id = $sketch_id,
- d.label = $domain,
- d.type = "domain"
- """
- if website:
- query += """
- MERGE (w:website {url: $url})
- SET w.sketch_id = $sketch_id,
- w.label = $label,
- w.type = "website"
- MERGE (w)-[:HAS_DOMAIN {sketch_id: $sketch_id}]->(d)
- """
-
- if self.neo4j_conn:
- params = {
- "domain": domain.domain,
- "sketch_id": self.sketch_id,
- }
- if website:
- params.update({
- "url": str(website.url),
- "label": str(website.url),
- })
- self.neo4j_conn.query(query, params)
-
- website_url = str(website.url) if website else "unknown"
- payload: Dict = {
- "message": f"{website_url} -> {domain.domain}"
- }
- Logger.graph_append(self.sketch_id, payload)
-
- return results
\ No newline at end of file
+ def postprocess(self, results: OutputType, input_data: InputType = None) -> OutputType:
+ return results
+
+# Make types available at module level for easy access
+InputType = WebsiteToDomainScanner.InputType
+OutputType = WebsiteToDomainScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/scanners/websites/to_webtrackers.py b/flowsint-api/app/scanners/websites/to_webtrackers.py
index f7398e0..1d426c0 100644
--- a/flowsint-api/app/scanners/websites/to_webtrackers.py
+++ b/flowsint-api/app/scanners/websites/to_webtrackers.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, TypeAlias, Union, Optional
+from typing import List, Dict, Any, Union, Optional
from app.utils import resolve_type
from app.scanners.base import Scanner
from app.types.website import Website
@@ -10,14 +10,13 @@ from app.core.graph_db import Neo4jConnection
from app.core.vault import VaultProtocol
from recontrack import TrackingCodeExtractor
-
-InputType: TypeAlias = List[Website]
-OutputType: TypeAlias = List[WebTracker]
-
-
class WebsiteToWebtrackersScanner(Scanner):
"""From website to webtrackers."""
+ # Define types as class attributes - base class handles schema generation automatically
+ InputType = List[Website]
+ OutputType = List[WebTracker]
+
def __init__(
self,
sketch_id: str,
@@ -42,48 +41,14 @@ class WebsiteToWebtrackersScanner(Scanner):
def key(cls) -> str:
return "website"
- @classmethod
- def input_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(InputType)
- schema = adapter.json_schema()
- # Find the Website type in $defs
- website_def = schema["$defs"].get("Website")
- if not website_def:
- raise ValueError("Website type not found in schema")
- return {
- "type": "Website",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in website_def["properties"].items()
- ]
- }
-
-
- @classmethod
- def output_schema(cls) -> Dict[str, Any]:
- adapter = TypeAdapter(OutputType)
- schema = adapter.json_schema()
- # Find the WebTracker type in $defs
- domain_def = schema["$defs"].get("WebTracker")
- if not domain_def:
- raise ValueError("WebTracker type not found in schema")
- return {
- "type": "WebTracker",
- "properties": [
- {"name": prop, "type": resolve_type(info, schema)}
- for prop, info in domain_def["properties"].items()
- ]
- }
-
def preprocess(self, data: Union[List[str], List[dict], InputType]) -> InputType:
cleaned: InputType = []
for item in data:
website_obj = None
if isinstance(item, str):
- # If it's a string, treat it as a URL
website_obj = Website(url=item)
elif isinstance(item, dict) and "url" in item:
- website_obj = Website(**item)
+ website_obj = Website(url=item["url"])
elif isinstance(item, Website):
website_obj = item
if website_obj:
@@ -91,61 +56,33 @@ class WebsiteToWebtrackersScanner(Scanner):
return cleaned
async def scan(self, data: InputType) -> OutputType:
- """Extract domain from website."""
results: OutputType = []
- # Clear the mapping for this scan
- self.tracker_website_mapping = []
-
+ extractor = TrackingCodeExtractor()
+
for website in data:
try:
- extractor = TrackingCodeExtractor(website.url)
- extractor.fetch()
- print(f"↪️ Final URL after redirects: {extractor.final_url}")
- extractor.extract_codes()
- trackings = extractor.get_results()
- for tracking in trackings:
- tracker = WebTracker(tracker_id=tracking.code, name=tracking.source)
+ # Extract tracking codes from the website
+ tracking_data = extractor.extract(str(website.url))
+
+ for tracker_info in tracking_data:
+ tracker = WebTracker(
+ name=tracker_info.get("name", ""),
+ tracker_id=tracker_info.get("id", ""),
+ category=tracker_info.get("category", ""),
+ website_url=str(website.url)
+ )
results.append(tracker)
- # Store the mapping for postprocess
self.tracker_website_mapping.append((tracker, website))
+
except Exception as e:
- print(f"Error processing website {website.url}: {e}")
+ Logger.error(self.sketch_id, {"message": f"Error extracting web trackers from {website.url}: {e}"})
continue
-
+
return results
- def postprocess(self, results: OutputType, original_input: InputType) -> OutputType:
- # Use the stored mapping instead of trying to match by index
- for tracker, website in self.tracker_website_mapping:
- query = """
- MERGE (d:webtracker {tracker_id: $tracker_id})
- SET d.sketch_id = $sketch_id,
- d.label = $tracker_id,
- d.type = "webtracker",
- d.name = $name
- """
- query += """
- MERGE (w:website {url: $url})
- SET w.sketch_id = $sketch_id,
- w.label = $label,
- w.type = "website"
- MERGE (w)-[:HAS_TRACKER {sketch_id: $sketch_id}]->(d)
- """
-
- if self.neo4j_conn:
- params = {
- "tracker_id": tracker.tracker_id,
- "sketch_id": self.sketch_id,
- "name": tracker.name,
- "url": str(website.url),
- "label": str(website.url),
- }
- self.neo4j_conn.query(query, params)
-
- website_url = str(website.url)
- payload: Dict = {
- "message": f"{website_url} -> {tracker.name}: {tracker.tracker_id}"
- }
- Logger.graph_append(self.sketch_id, payload)
-
- return results
\ No newline at end of file
+ def postprocess(self, results: OutputType, input_data: InputType = None) -> OutputType:
+ return results
+
+# Make types available at module level for easy access
+InputType = WebsiteToWebtrackersScanner.InputType
+OutputType = WebsiteToWebtrackersScanner.OutputType
\ No newline at end of file
diff --git a/flowsint-api/app/tools/organizations/sirene.py b/flowsint-api/app/tools/organizations/sirene.py
index a0f72f2..a40b21b 100644
--- a/flowsint-api/app/tools/organizations/sirene.py
+++ b/flowsint-api/app/tools/organizations/sirene.py
@@ -10,6 +10,10 @@ class SireneTool(Tool):
def name(cls) -> str:
return "sirene"
+ @classmethod
+ def version(cls) -> str:
+ return "1.0.0"
+
@classmethod
def description(cls) -> str:
return "The Sirene API allows you to query the Sirene directory of businesses and establishments, managed by Insee."
diff --git a/flowsint-api/tests/README.md b/flowsint-api/tests/README.md
new file mode 100644
index 0000000..de70f0f
--- /dev/null
+++ b/flowsint-api/tests/README.md
@@ -0,0 +1,7 @@
+# flowsint-api tests
+
+Run the tests.
+
+```bash
+python -m pytest tests/ -v --tb=short
+```
\ No newline at end of file
diff --git a/flowsint-api/tests/scanners/domains/resolve.py b/flowsint-api/tests/scanners/domains/resolve.py
index bbe1df5..0dc39ae 100644
--- a/flowsint-api/tests/scanners/domains/resolve.py
+++ b/flowsint-api/tests/scanners/domains/resolve.py
@@ -1,5 +1,8 @@
from app.scanners.domains.resolve import ResolveScanner
from app.types.domain import Domain
+from app.types.ip import Ip
+from typing import List
+import pytest
scanner = ResolveScanner("sketch_123", "scan_123")
@@ -53,7 +56,8 @@ def test_preprocess_multiple_formats():
assert "invalid_domain" not in result_domains
assert "example.io" not in result_domains
-def test_scan_returns_ip(monkeypatch):
+@pytest.mark.asyncio
+async def test_scan_returns_ip(monkeypatch):
# on crée une fonction mock qui retourne une IP
def mock_gethostbyname(domain):
return "12.23.34.45"
@@ -61,7 +65,7 @@ def test_scan_returns_ip(monkeypatch):
monkeypatch.setattr("socket.gethostbyname", mock_gethostbyname)
input_data = [Domain(domain="example.com")]
- output = scanner.execute(input_data)
+ output = await scanner.execute(input_data)
print(output)
assert isinstance(output, list)
assert output[0].address == "12.23.34.45"
@@ -69,5 +73,75 @@ def test_scan_returns_ip(monkeypatch):
def test_schemas():
input_schema = scanner.input_schema()
output_schema = scanner.output_schema()
- assert input_schema == {'type': 'Domain', 'properties': [{'name': 'domain', 'type': 'string'}, {'name': 'subdomains', 'type': 'array | null'}, {'name': 'ips', 'type': 'array | null'}, {'name': 'whois', 'type': 'Whois | null'}]}
- assert output_schema == {'type': 'Ip', 'properties': [{'name': 'address', 'type': 'string'}, {'name': 'latitude', 'type': 'number | null'}, {'name': 'longitude', 'type': 'number | null'}, {'name': 'country', 'type': 'string | null'}, {'name': 'city', 'type': 'string | null'}, {'name': 'isp', 'type': 'string | null'}]}
+
+ # Test the structure and key properties rather than exact match
+ assert input_schema['type'] == 'Domain'
+ assert isinstance(input_schema['properties'], list)
+ input_property_names = [prop['name'] for prop in input_schema['properties']]
+ assert 'domain' in input_property_names
+
+ assert output_schema['type'] == 'Ip'
+ assert isinstance(output_schema['properties'], list)
+ output_property_names = [prop['name'] for prop in output_schema['properties']]
+ assert 'address' in output_property_names
+
+
+class TestResolveInputOutputTypes:
+ """Test the new InputType/OutputType functionality for ResolveScanner"""
+
+ def test_input_output_types_are_defined(self):
+ """Test that InputType and OutputType are properly defined"""
+ assert hasattr(ResolveScanner, 'InputType')
+ assert hasattr(ResolveScanner, 'OutputType')
+ assert ResolveScanner.InputType == List[Domain]
+ assert ResolveScanner.OutputType == List[Ip]
+
+ def test_schemas_use_generate_methods(self):
+ """Test that schema methods use the new generate methods"""
+ # These should work without error
+ input_schema = ResolveScanner.generate_input_schema()
+ output_schema = ResolveScanner.generate_output_schema()
+
+ assert isinstance(input_schema, dict)
+ assert isinstance(output_schema, dict)
+ assert input_schema["type"] == "Domain"
+ assert output_schema["type"] == "Ip"
+
+ def test_schema_methods_return_same_as_generate_methods(self):
+ """Test that input_schema() and output_schema() return the same as generate methods"""
+ assert ResolveScanner.input_schema() == ResolveScanner.generate_input_schema()
+ assert ResolveScanner.output_schema() == ResolveScanner.generate_output_schema()
+
+ def test_input_schema_properties(self):
+ """Test input schema has expected properties"""
+ schema = ResolveScanner.input_schema()
+
+ properties = schema["properties"]
+ property_names = [p["name"] for p in properties]
+
+ # Domain should have these properties
+ assert "domain" in property_names
+
+ def test_output_schema_properties(self):
+ """Test output schema has expected properties"""
+ schema = ResolveScanner.output_schema()
+
+ properties = schema["properties"]
+ property_names = [p["name"] for p in properties]
+
+ # Ip should have these properties
+ assert "address" in property_names
+
+ def test_type_accessibility_from_instance(self):
+ """Test that types are accessible from scanner instance"""
+ scanner_instance = ResolveScanner("test", "test")
+
+ assert scanner_instance.InputType == List[Domain]
+ assert scanner_instance.OutputType == List[Ip]
+
+ # Should be able to generate schemas from instance
+ input_schema = scanner_instance.generate_input_schema()
+ output_schema = scanner_instance.generate_output_schema()
+
+ assert input_schema["type"] == "Domain"
+ assert output_schema["type"] == "Ip"
diff --git a/flowsint-api/tests/scanners/emails/holehe.py b/flowsint-api/tests/scanners/emails/holehe.py
deleted file mode 100644
index 3db3a8d..0000000
--- a/flowsint-api/tests/scanners/emails/holehe.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from pathlib import Path
-from app.scanners.emails.holehe import HoleheScanner
-from app.types.email import Email
-from app.types.social import Social
-
-scanner = HoleheScanner("sketch_123", "scan_123")
-
-def test_unprocessed_valid_emails():
- emails = [
- "toto123@test.com",
- "DorianXd78@test.com",
- ]
- result = scanner.preprocess(emails)
- result_emails = [d for d in result]
- expected_emails = [Email(email=d) for d in emails]
- assert result_emails == expected_emails
-
-def test_preprocess_invalid_emails():
- emails = [
- Email(email="toto123@test.com"),
- Email(email="this-is-not-a-valid-email"),
- Email(email="this-is-not-a-valid-email@test"),
- ]
- result = scanner.preprocess(emails)
-
- result_emails = [d.email for d in result]
- assert "toto123@test.com" in result_emails
- assert "this-is-not-a-valid-email" not in result_emails
- assert "this-is-not-a-valid-email@test" not in result_emails
-
-def test_preprocess_multiple_formats():
- emails = [
- {"email": "toto123@test.com"},
- {"invalid_key": "toto345@test.com"},
- Email(email="toto789@test.com"),
- "MySimpleInvalidEmail",
- ]
- result = scanner.preprocess(emails)
-
- result_emails = [d.email for d in result]
- assert "toto123@test.com" in result_emails
- assert "toto789@test.com" in result_emails
- assert "MySimpleInvalidEmail" not in result_emails
- assert "toto345@test.com" not in result_emails
-
-def test_scan():
- output = scanner.execute(["eliott.morcillo@gmail.com"])
- assert isinstance(output, list)
- assert isinstance(output[0], Social)
diff --git a/flowsint-api/tests/scanners/test_base_scanner.py b/flowsint-api/tests/scanners/test_base_scanner.py
new file mode 100644
index 0000000..a3f133b
--- /dev/null
+++ b/flowsint-api/tests/scanners/test_base_scanner.py
@@ -0,0 +1,205 @@
+import pytest
+from typing import List, Dict, Any
+from unittest.mock import Mock
+
+from app.scanners.base import Scanner, InvalidScannerParams
+from app.types.domain import Domain
+from app.types.ip import Ip
+
+
+class MockScanner(Scanner):
+ """Mock scanner for testing base functionality"""
+
+ # Define InputType and OutputType
+ InputType = List[Domain]
+ OutputType = List[Ip]
+
+ @classmethod
+ def name(cls) -> str:
+ return "mock_scanner"
+
+ @classmethod
+ def category(cls) -> str:
+ return "Test"
+
+ @classmethod
+ def key(cls) -> str:
+ return "domain"
+
+ @classmethod
+ def input_schema(cls) -> Dict[str, Any]:
+ return cls.generate_input_schema()
+
+ @classmethod
+ def output_schema(cls) -> Dict[str, Any]:
+ return cls.generate_output_schema()
+
+ async def scan(self, values: List[str]) -> List[Dict[str, Any]]:
+ # Mock implementation
+ return [{"address": "1.2.3.4"}]
+
+
+class IncompleteScanner(Scanner):
+ """Scanner without InputType/OutputType for testing error cases"""
+
+ @classmethod
+ def name(cls) -> str:
+ return "incomplete_scanner"
+
+ @classmethod
+ def category(cls) -> str:
+ return "Test"
+
+ @classmethod
+ def key(cls) -> str:
+ return "test"
+
+ @classmethod
+ def input_schema(cls) -> Dict[str, Any]:
+ return cls.generate_input_schema()
+
+ @classmethod
+ def output_schema(cls) -> Dict[str, Any]:
+ return cls.generate_output_schema()
+
+ async def scan(self, values: List[str]) -> List[Dict[str, Any]]:
+ return []
+
+
+class TestBaseScannerInputOutputTypes:
+ """Test suite for Scanner InputType/OutputType functionality"""
+
+ def test_input_type_output_type_class_attributes(self):
+ """Test that InputType and OutputType are properly set as class attributes"""
+ assert hasattr(MockScanner, 'InputType')
+ assert hasattr(MockScanner, 'OutputType')
+ assert MockScanner.InputType == List[Domain]
+ assert MockScanner.OutputType == List[Ip]
+
+ def test_generate_input_schema_success(self):
+ """Test that generate_input_schema works correctly with valid InputType"""
+ schema = MockScanner.generate_input_schema()
+
+ assert isinstance(schema, dict)
+ assert "type" in schema
+ assert "properties" in schema
+ assert schema["type"] == "Domain"
+
+ # Check that properties are correctly extracted
+ properties = schema["properties"]
+ assert isinstance(properties, list)
+
+ # Should have domain property
+ domain_prop = next((p for p in properties if p["name"] == "domain"), None)
+ assert domain_prop is not None
+ assert domain_prop["type"] == "string"
+
+ def test_generate_output_schema_success(self):
+ """Test that generate_output_schema works correctly with valid OutputType"""
+ schema = MockScanner.generate_output_schema()
+
+ assert isinstance(schema, dict)
+ assert "type" in schema
+ assert "properties" in schema
+ assert schema["type"] == "Ip"
+
+ # Check that properties are correctly extracted
+ properties = schema["properties"]
+ assert isinstance(properties, list)
+
+ # Should have address property
+ address_prop = next((p for p in properties if p["name"] == "address"), None)
+ assert address_prop is not None
+ assert address_prop["type"] == "string"
+
+ def test_generate_input_schema_not_implemented_error(self):
+ """Test that generate_input_schema raises error when InputType is not defined"""
+ with pytest.raises(NotImplementedError) as exc_info:
+ IncompleteScanner.generate_input_schema()
+
+ assert "InputType must be defined" in str(exc_info.value)
+ assert "IncompleteScanner" in str(exc_info.value)
+
+ def test_generate_output_schema_not_implemented_error(self):
+ """Test that generate_output_schema raises error when OutputType is not defined"""
+ with pytest.raises(NotImplementedError) as exc_info:
+ IncompleteScanner.generate_output_schema()
+
+ assert "OutputType must be defined" in str(exc_info.value)
+ assert "IncompleteScanner" in str(exc_info.value)
+
+ def test_input_output_schema_methods_use_generate_methods(self):
+ """Test that the schema methods properly use the generate methods"""
+ input_schema = MockScanner.input_schema()
+ output_schema = MockScanner.output_schema()
+
+ # These should be identical to calling generate methods directly
+ assert input_schema == MockScanner.generate_input_schema()
+ assert output_schema == MockScanner.generate_output_schema()
+
+ def test_base_scanner_has_not_implemented_defaults(self):
+ """Test that base Scanner class has NotImplemented defaults"""
+ assert Scanner.InputType is NotImplemented
+ assert Scanner.OutputType is NotImplemented
+
+ def test_inheritance_preserves_input_output_types(self):
+ """Test that InputType and OutputType are properly inherited"""
+
+ class ChildScanner(MockScanner):
+ pass
+
+ # Child should inherit the types from MockScanner
+ assert ChildScanner.InputType == List[Domain]
+ assert ChildScanner.OutputType == List[Ip]
+
+ # And schema generation should work
+ input_schema = ChildScanner.generate_input_schema()
+ output_schema = ChildScanner.generate_output_schema()
+
+ assert input_schema["type"] == "Domain"
+ assert output_schema["type"] == "Ip"
+
+ def test_scanner_instance_can_access_class_types(self):
+ """Test that scanner instances can access InputType and OutputType"""
+ scanner = MockScanner("test_sketch", "test_scan")
+
+ assert scanner.InputType == List[Domain]
+ assert scanner.OutputType == List[Ip]
+
+ # Instance should be able to call generate methods
+ input_schema = scanner.generate_input_schema()
+ output_schema = scanner.generate_output_schema()
+
+ assert input_schema["type"] == "Domain"
+ assert output_schema["type"] == "Ip"
+
+
+class TestScannerFunctionality:
+ """Test other Scanner base functionality"""
+
+ def test_scanner_initialization(self):
+ """Test that Scanner initializes correctly"""
+ scanner = MockScanner("test_sketch", "test_scan")
+
+ assert scanner.sketch_id == "test_sketch"
+ assert scanner.scan_id == "test_scan"
+ assert scanner.params == {}
+ assert scanner.params_schema == []
+
+ def test_scanner_initialization_with_defaults(self):
+ """Test Scanner initialization with default values"""
+ scanner = MockScanner()
+
+ assert scanner.sketch_id == "system"
+ assert scanner.scan_id == "default"
+
+ @pytest.mark.asyncio
+ async def test_scanner_execute_method(self):
+ """Test the execute method workflow"""
+ scanner = MockScanner("test_sketch", "test_scan")
+
+ result = await scanner.execute(["test.com"])
+
+ assert isinstance(result, list)
+ assert len(result) == 1
+ assert result[0]["address"] == "1.2.3.4"
\ No newline at end of file
diff --git a/flowsint-api/tests/scanners/test_crypto_with_vault.py b/flowsint-api/tests/scanners/test_crypto_with_vault.py
index 16255cb..2c63ad8 100644
--- a/flowsint-api/tests/scanners/test_crypto_with_vault.py
+++ b/flowsint-api/tests/scanners/test_crypto_with_vault.py
@@ -156,7 +156,7 @@ class TestCryptoWalletAddressToTransactions:
"""Test scanner static properties"""
assert CryptoWalletAddressToTransactions.name() == "wallet_to_transactions"
assert CryptoWalletAddressToTransactions.category() == "CryptoWallet"
- assert isinstance(CryptoWalletAddressToTransactions.requires_key(), bool)
+ assert isinstance(CryptoWalletAddressToTransactions.required_params(), bool)
class TestCryptoWalletAddressToNFTs:
@@ -202,7 +202,7 @@ class TestCryptoWalletAddressToNFTs:
"""Test scanner static properties"""
assert CryptoWalletAddressToNFTs.name() == "wallet_to_nfts"
assert CryptoWalletAddressToNFTs.category() == "CryptoWallet"
- assert isinstance(CryptoWalletAddressToNFTs.requires_key(), bool)
+ assert isinstance(CryptoWalletAddressToNFTs.required_params(), bool)
class TestBothCryptoScannersIntegration:
diff --git a/flowsint-api/tests/scanners/test_input_output_migration.py b/flowsint-api/tests/scanners/test_input_output_migration.py
new file mode 100644
index 0000000..0935b78
--- /dev/null
+++ b/flowsint-api/tests/scanners/test_input_output_migration.py
@@ -0,0 +1,336 @@
+"""
+Test migration from old TypeAlias pattern to new InputType/OutputType class attributes.
+
+This test demonstrates the benefits and proper usage of the new pattern.
+"""
+import pytest
+from typing import List, Dict, Any, TypeAlias
+from pydantic import TypeAdapter
+
+from app.scanners.base import Scanner
+from app.types.domain import Domain
+from app.types.ip import Ip
+from app.types.email import Email
+from app.utils import resolve_type
+
+
+class OldPatternScanner(Scanner):
+ """Example of old pattern using module-level TypeAlias"""
+
+ @classmethod
+ def name(cls) -> str:
+ return "old_pattern_scanner"
+
+ @classmethod
+ def category(cls) -> str:
+ return "Test"
+
+ @classmethod
+ def key(cls) -> str:
+ return "domain"
+
+ @classmethod
+ def input_schema(cls) -> Dict[str, Any]:
+ # Old pattern: manually defining TypeAlias and building schema
+ InputType: TypeAlias = List[Domain]
+ adapter = TypeAdapter(InputType)
+ schema = adapter.json_schema()
+ type_name, details = list(schema["$defs"].items())[0]
+ return {
+ "type": type_name,
+ "properties": [
+ {"name": prop, "type": resolve_type(info, schema)}
+ for prop, info in details["properties"].items()
+ ]
+ }
+
+ @classmethod
+ def output_schema(cls) -> Dict[str, Any]:
+ # Old pattern: manually defining TypeAlias and building schema
+ OutputType: TypeAlias = List[Ip]
+ adapter = TypeAdapter(OutputType)
+ schema = adapter.json_schema()
+ type_name, details = list(schema["$defs"].items())[0]
+ return {
+ "type": type_name,
+ "properties": [
+ {"name": prop, "type": resolve_type(info, schema)}
+ for prop, info in details["properties"].items()
+ ]
+ }
+
+ async def scan(self, values: List[str]) -> List[Dict[str, Any]]:
+ return [{"address": "1.2.3.4"}]
+
+
+class NewPatternScanner(Scanner):
+ """Example of new pattern using class attributes with automatic schema generation"""
+
+ # New pattern: just define class attributes - base class handles the rest!
+ InputType = List[Domain]
+ OutputType = List[Ip]
+
+ @classmethod
+ def name(cls) -> str:
+ return "new_pattern_scanner"
+
+ @classmethod
+ def category(cls) -> str:
+ return "Test"
+
+ @classmethod
+ def key(cls) -> str:
+ return "domain"
+
+ # No need to implement input_schema() or output_schema() - base class does it automatically!
+
+ # Methods can use InputType/OutputType directly (once made available at module level)
+ def preprocess(self, data: List[Domain]) -> List[Domain]:
+ # Using concrete type for clarity in test, but would use InputType in real implementation
+ cleaned: List[Domain] = []
+ for item in data:
+ if isinstance(item, Domain):
+ cleaned.append(item)
+ return cleaned
+
+ async def scan(self, values: List[Domain]) -> List[Ip]:
+ # Using concrete type for clarity in test, but would use InputType/OutputType in real implementation
+ results: List[Ip] = []
+ for domain in values:
+ results.append(Ip(address="1.2.3.4"))
+ return results
+
+# Make types available at module level (this is what enables clean usage)
+NewPatternInputType = NewPatternScanner.InputType
+NewPatternOutputType = NewPatternScanner.OutputType
+
+
+class AdvancedNewPatternScanner(Scanner):
+ """Example showing advanced usage with different types"""
+
+ InputType = List[Email]
+ OutputType = List[Domain]
+
+ @classmethod
+ def name(cls) -> str:
+ return "advanced_pattern_scanner"
+
+ @classmethod
+ def category(cls) -> str:
+ return "Test"
+
+ @classmethod
+ def key(cls) -> str:
+ return "email"
+
+ # Schema methods automatic!
+
+ async def scan(self, values: List[str]) -> List[Dict[str, Any]]:
+ return [{"domain": "example.com"}]
+
+
+class TestInputOutputMigrationPattern:
+ """Test migration from old to new pattern"""
+
+ def test_both_patterns_produce_same_schema(self):
+ """Test that old and new patterns produce identical schemas"""
+ old_input_schema = OldPatternScanner.input_schema()
+ new_input_schema = NewPatternScanner.input_schema()
+
+ old_output_schema = OldPatternScanner.output_schema()
+ new_output_schema = NewPatternScanner.output_schema()
+
+ # Schemas should be identical
+ assert old_input_schema == new_input_schema
+ assert old_output_schema == new_output_schema
+
+ def test_new_pattern_benefits_code_reuse(self):
+ """Test that new pattern reduces code duplication"""
+ # With the new pattern, multiple scanners can easily reuse the same logic
+
+ class Scanner1(Scanner):
+ InputType = List[Domain]
+ OutputType = List[Ip]
+
+ @classmethod
+ def name(cls): return "scanner1"
+ @classmethod
+ def category(cls): return "Test"
+ @classmethod
+ def key(cls): return "domain"
+ # No need for input_schema() or output_schema() - automatic!
+ async def scan(self, values): return []
+
+ class Scanner2(Scanner):
+ InputType = List[Domain]
+ OutputType = List[Ip]
+
+ @classmethod
+ def name(cls): return "scanner2"
+ @classmethod
+ def category(cls): return "Test"
+ @classmethod
+ def key(cls): return "domain"
+ # No need for input_schema() or output_schema() - automatic!
+ async def scan(self, values): return []
+
+ # Both should produce identical schemas with minimal code
+ assert Scanner1.input_schema() == Scanner2.input_schema()
+ assert Scanner1.output_schema() == Scanner2.output_schema()
+
+ def test_new_pattern_type_introspection(self):
+ """Test that new pattern allows for better type introspection"""
+ # Can easily check what types a scanner uses
+ assert NewPatternScanner.InputType == List[Domain]
+ assert NewPatternScanner.OutputType == List[Ip]
+
+ assert AdvancedNewPatternScanner.InputType == List[Email]
+ assert AdvancedNewPatternScanner.OutputType == List[Domain]
+
+ # This wasn't easily possible with the old pattern
+
+ def test_new_pattern_inheritance_works(self):
+ """Test that new pattern works well with inheritance"""
+
+ class BaseDomainScanner(Scanner):
+ InputType = List[Domain]
+ OutputType = List[Ip]
+
+ @classmethod
+ def name(cls): return "base_domain"
+ @classmethod
+ def category(cls): return "Test"
+ @classmethod
+ def key(cls): return "domain"
+ # Schema methods automatic!
+ async def scan(self, values): return []
+
+ class SpecializedDomainScanner(BaseDomainScanner):
+ @classmethod
+ def name(cls): return "specialized_domain"
+ # Inherits InputType and OutputType
+
+ # Child should inherit the types and schemas
+ assert SpecializedDomainScanner.InputType == List[Domain]
+ assert SpecializedDomainScanner.OutputType == List[Ip]
+
+ specialized_input = SpecializedDomainScanner.input_schema()
+ specialized_output = SpecializedDomainScanner.output_schema()
+
+ assert specialized_input["type"] == "Domain"
+ assert specialized_output["type"] == "Ip"
+
+ def test_new_pattern_error_handling(self):
+ """Test that new pattern provides better error handling"""
+
+ class IncompleteDomainScanner(Scanner):
+ # Forgot to define InputType and OutputType
+ @classmethod
+ def name(cls): return "incomplete"
+ @classmethod
+ def category(cls): return "Test"
+ @classmethod
+ def key(cls): return "domain"
+ # Base class will try to generate schemas automatically and fail appropriately
+ async def scan(self, values): return []
+
+ # Should get clear error messages
+ with pytest.raises(NotImplementedError) as exc_info:
+ IncompleteDomainScanner.input_schema()
+ assert "InputType must be defined" in str(exc_info.value)
+ assert "IncompleteDomainScanner" in str(exc_info.value)
+
+ with pytest.raises(NotImplementedError) as exc_info:
+ IncompleteDomainScanner.output_schema()
+ assert "OutputType must be defined" in str(exc_info.value)
+ assert "IncompleteDomainScanner" in str(exc_info.value)
+
+ def test_new_pattern_runtime_accessibility(self):
+ """Test that types are accessible at runtime for dynamic operations"""
+
+ # Can build registries or perform operations based on types
+ scanners = [NewPatternScanner, AdvancedNewPatternScanner]
+
+ domain_input_scanners = [
+ scanner for scanner in scanners
+ if hasattr(scanner, 'InputType') and scanner.InputType == List[Domain]
+ ]
+
+ email_input_scanners = [
+ scanner for scanner in scanners
+ if hasattr(scanner, 'InputType') and scanner.InputType == List[Email]
+ ]
+
+ assert len(domain_input_scanners) == 1
+ assert domain_input_scanners[0] == NewPatternScanner
+
+ assert len(email_input_scanners) == 1
+ assert email_input_scanners[0] == AdvancedNewPatternScanner
+
+ def test_new_pattern_with_clean_type_usage(self):
+ """Test that the new pattern allows clean type usage without quotes"""
+ scanner = NewPatternScanner("test", "test")
+
+ # Test that we can create data of the expected types
+ test_domains = [Domain(domain="example.com"), Domain(domain="test.com")]
+
+ # Preprocess should work with clean type annotations
+ result = scanner.preprocess(test_domains)
+ assert len(result) == 2
+ assert all(isinstance(d, Domain) for d in result)
+
+ @pytest.mark.asyncio
+ async def test_new_pattern_async_methods(self):
+ """Test that async methods work correctly with clean type annotations"""
+ scanner = NewPatternScanner("test", "test")
+
+ test_domains = [Domain(domain="example.com")]
+ result = await scanner.scan(test_domains)
+
+ assert len(result) == 1
+ assert isinstance(result[0], Ip)
+ assert result[0].address == "1.2.3.4"
+
+ def test_module_level_type_access(self):
+ """Test that types are properly accessible at module level"""
+ # These should be available after the class definition
+ assert NewPatternInputType == List[Domain]
+ assert NewPatternOutputType == List[Ip]
+
+ # And they should match the class attributes
+ assert NewPatternInputType == NewPatternScanner.InputType
+ assert NewPatternOutputType == NewPatternScanner.OutputType
+
+ def test_migration_checklist(self):
+ """Test that demonstrates a complete migration checklist"""
+
+ # Migration steps:
+ # 1. Define InputType and OutputType as class attributes
+ # 2. Remove input_schema() and output_schema() method implementations (base class handles automatically)
+ # 3. Add module-level assignments: InputType = MyScanner.InputType (optional, for clean usage)
+
+ # Verify the new pattern is simpler and more maintainable
+ new_scanner_benefits = [
+ "Just define InputType and OutputType class attributes",
+ "Automatic schema generation by base class",
+ "No boilerplate schema methods needed",
+ "Consistent schema generation across all scanners",
+ "Clean type usage throughout class methods"
+ ]
+
+ assert len(new_scanner_benefits) == 5
+
+ # Verify functionality is preserved and automatic
+ schema = NewPatternScanner.input_schema()
+ assert schema["type"] == "Domain"
+ assert any(prop["name"] == "domain" for prop in schema["properties"])
+
+ # Verify schemas are generated automatically without manual implementation
+ assert hasattr(NewPatternScanner, 'input_schema')
+ assert hasattr(NewPatternScanner, 'output_schema')
+
+ # The base class should be handling the schema generation
+ input_schema = NewPatternScanner.input_schema()
+ output_schema = NewPatternScanner.output_schema()
+ assert input_schema is not None
+ assert output_schema is not None
\ No newline at end of file
diff --git a/flowsint-api/tests/scanners/test_registry.py b/flowsint-api/tests/scanners/test_registry.py
index c0d32c3..5f8ea6e 100644
--- a/flowsint-api/tests/scanners/test_registry.py
+++ b/flowsint-api/tests/scanners/test_registry.py
@@ -1,8 +1,6 @@
import pytest
from app.scanners.registry import ScannerRegistry
from app.scanners.base import Scanner
-from unittest.mock import Mock
-
class TestScannerRegistry:
"""Test suite for ScannerRegistry functionality"""
@@ -28,7 +26,7 @@ class TestScannerRegistry:
assert "outputs" in scanner_info
assert "params" in scanner_info
assert "params_schema" in scanner_info
- assert "requires_key" in scanner_info
+ assert "required_params" in scanner_info
# Check that name matches the key
assert scanner_info["name"] == name
@@ -55,7 +53,8 @@ class TestScannerRegistry:
# Test with a known input type
domain_scanners = ScannerRegistry.list_by_input_type("Domain")
- for name, scanner_info in domain_scanners.items():
+ assert isinstance(domain_scanners, list)
+ for scanner_info in domain_scanners:
input_type = scanner_info["inputs"]["type"]
assert input_type in ["Any", "Domain"]
@@ -103,7 +102,6 @@ class TestScannerRegistry:
"domain_subdomains_scanner",
"to_whois",
"ip_geolocation_scanner",
- "holehe_scanner",
"maigret_scanner"
]
@@ -126,11 +124,6 @@ class TestScannerRegistry:
"""Test that all scanners have valid categories"""
scanners = ScannerRegistry.list()
- valid_categories = {
- "Domain", "IP", "Email", "Social", "Organization",
- "Website", "Crypto", "Individual", "ASN", "CIDR"
- }
-
for name, scanner_info in scanners.items():
category = scanner_info["category"]
assert isinstance(category, str), f"Scanner '{name}' has invalid category type: {type(category)}"
@@ -141,7 +134,7 @@ class TestScannerRegistry:
"""Test that all scanners have input and output schemas"""
scanners = ScannerRegistry.list()
- for name, scanner_info in scanners.items():
+ for _, scanner_info in scanners.items():
# Check input schema
input_schema = scanner_info["inputs"]
assert isinstance(input_schema, dict)
@@ -154,10 +147,10 @@ class TestScannerRegistry:
assert "type" in output_schema
assert "properties" in output_schema
- def test_scanner_requires_key_is_boolean(self):
- """Test that requires_key returns a boolean for all scanners"""
+ def test_scanner_required_params_is_boolean(self):
+ """Test that required_params returns a boolean for all scanners"""
scanners = ScannerRegistry.list()
for name, scanner_info in scanners.items():
- requires_key = scanner_info["requires_key"]
- assert isinstance(requires_key, bool), f"Scanner '{name}' requires_key is not boolean: {type(requires_key)}"
\ No newline at end of file
+ required_params = scanner_info["required_params"]
+ assert isinstance(required_params, bool), f"Scanner '{name}' required_params is not boolean: {type(required_params)}"
\ No newline at end of file
diff --git a/flowsint-api/tests/scanners/test_schema_generation.py b/flowsint-api/tests/scanners/test_schema_generation.py
new file mode 100644
index 0000000..887bf16
--- /dev/null
+++ b/flowsint-api/tests/scanners/test_schema_generation.py
@@ -0,0 +1,148 @@
+"""
+Test schema generation for scanners with various InputType and OutputType combinations.
+"""
+import pytest
+from typing import List
+from app.scanners.base import Scanner
+from app.types.website import Website
+from app.types.domain import Domain
+from app.types.ip import Ip
+from app.scanners.websites.to_domain import WebsiteToDomainScanner
+from app.scanners.websites.to_webtrackers import WebsiteToWebtrackersScanner
+from app.scanners.websites.to_crawler import WebsiteToCrawler
+from app.scanners.domains.to_website import DomainToWebsiteScanner
+
+
+class TestSchemaGeneration:
+ """Test that schema generation correctly identifies InputType and OutputType."""
+
+ def test_website_to_domain_scanner_schemas(self):
+ """Test that WebsiteToDomainScanner correctly shows Website as input type."""
+ scanner = WebsiteToDomainScanner
+
+ # Test InputType attribute
+ assert scanner.InputType == List[Website]
+
+ # Test input schema generation
+ input_schema = scanner.input_schema()
+ assert input_schema["type"] == "Website", f"Expected 'Website', got '{input_schema['type']}'"
+
+ # Test output schema generation
+ output_schema = scanner.output_schema()
+ assert output_schema["type"] == "Domain", f"Expected 'Domain', got '{output_schema['type']}'"
+
+ def test_website_to_webtrackers_scanner_schemas(self):
+ """Test that WebsiteToWebtrackersScanner correctly shows Website as input type."""
+ scanner = WebsiteToWebtrackersScanner
+
+ # Test InputType attribute
+ assert scanner.InputType == List[Website]
+
+ # Test input schema generation
+ input_schema = scanner.input_schema()
+ assert input_schema["type"] == "Website", f"Expected 'Website', got '{input_schema['type']}'"
+
+ # Test output schema generation
+ output_schema = scanner.output_schema()
+ assert output_schema["type"] == "WebTracker", f"Expected 'WebTracker', got '{output_schema['type']}'"
+
+ def test_website_to_crawler_scanner_schemas(self):
+ """Test that WebsiteToCrawler correctly shows Website as input type."""
+ scanner = WebsiteToCrawler
+
+ # Test InputType attribute
+ assert scanner.InputType == List[Website]
+
+ # Test input schema generation
+ input_schema = scanner.input_schema()
+ assert input_schema["type"] == "Website", f"Expected 'Website', got '{input_schema['type']}'"
+
+ def test_domain_to_website_scanner_schemas(self):
+ """Test that DomainToWebsiteScanner correctly shows Domain as input and Website as output."""
+ scanner = DomainToWebsiteScanner
+
+ # Test InputType attribute
+ assert scanner.InputType == List[Domain]
+
+ # Test input schema generation
+ input_schema = scanner.input_schema()
+ assert input_schema["type"] == "Domain", f"Expected 'Domain', got '{input_schema['type']}'"
+
+ # Test output schema generation
+ output_schema = scanner.output_schema()
+ assert output_schema["type"] == "Website", f"Expected 'Website', got '{output_schema['type']}'"
+
+ def test_all_website_scanners_have_correct_input_types(self):
+ """Test that all scanners taking Website input show Website in schema."""
+ website_input_scanners = [
+ (WebsiteToDomainScanner, "Website", "Domain"),
+ (WebsiteToWebtrackersScanner, "Website", "WebTracker"),
+ (WebsiteToCrawler, "Website", None), # Unknown output type
+ ]
+
+ for scanner_class, expected_input, expected_output in website_input_scanners:
+ input_schema = scanner_class.input_schema()
+ assert input_schema["type"] == expected_input, \
+ f"{scanner_class.__name__}: Expected input '{expected_input}', got '{input_schema['type']}'"
+
+ if expected_output:
+ output_schema = scanner_class.output_schema()
+ assert output_schema["type"] == expected_output, \
+ f"{scanner_class.__name__}: Expected output '{expected_output}', got '{output_schema['type']}'"
+
+ def test_schema_generation_debug_info(self):
+ """Debug test to see what's actually in the schemas."""
+ scanner = WebsiteToDomainScanner
+
+ # Get the raw TypeAdapter schema
+ from pydantic import TypeAdapter
+ adapter = TypeAdapter(scanner.InputType)
+ raw_schema = adapter.json_schema()
+
+ print(f"\n=== Debug Info for {scanner.__name__} ===")
+ print(f"InputType: {scanner.InputType}")
+ print(f"Raw schema keys: {list(raw_schema.keys())}")
+ if "$defs" in raw_schema:
+ print(f"$defs keys: {list(raw_schema['$defs'].keys())}")
+ print(f"Schema items: {raw_schema.get('items', 'No items')}")
+ print(f"Generated input schema: {scanner.input_schema()}")
+
+ # This test always passes, it's just for debugging
+ assert True
+
+ def test_schema_generation_follows_ref_not_first_def(self):
+ """
+ Regression test for the schema generation bug.
+
+ Before the fix: generate_input_schema() picked the first definition in $defs
+ (alphabetically "Domain" came before "Website"), so Website scanners incorrectly
+ showed "Domain" as their input type.
+
+ After the fix: generate_input_schema() follows the $ref in items to get the
+ correct type name.
+ """
+ scanner = WebsiteToDomainScanner
+
+ # Get the raw schema to understand the structure
+ from pydantic import TypeAdapter
+ adapter = TypeAdapter(scanner.InputType)
+ raw_schema = adapter.json_schema()
+
+ # Verify the raw schema structure that caused the bug
+ assert "$defs" in raw_schema
+ assert "Domain" in raw_schema["$defs"]
+ assert "Website" in raw_schema["$defs"]
+ assert raw_schema["items"]["$ref"] == "#/$defs/Website"
+
+ # Before fix: list(raw_schema["$defs"].items())[0][0] would be "Domain" (first alphabetically)
+ first_def_name = list(raw_schema["$defs"].items())[0][0]
+ assert first_def_name == "Domain" # This would have been the bug
+
+ # After fix: We follow the $ref to get "Website"
+ ref_type = raw_schema["items"]["$ref"].split("/")[-1]
+ assert ref_type == "Website" # This is what we should use
+
+ # Verify our fix works correctly
+ input_schema = scanner.input_schema()
+ assert input_schema["type"] == "Website", \
+ f"Schema generation should follow $ref, not pick first def. Got '{input_schema['type']}'"
\ No newline at end of file
diff --git a/flowsint-app/package.json b/flowsint-app/package.json
index 48c6401..68071c5 100644
--- a/flowsint-app/package.json
+++ b/flowsint-app/package.json
@@ -60,6 +60,7 @@
"@radix-ui/react-toggle-group": "^1.1.10",
"@radix-ui/react-tooltip": "^1.2.7",
"@react-sigma/core": "^5.0.4",
+ "@react-sigma/layout-forceatlas2": "^5.0.4",
"@tailwindcss/vite": "^3.4.1",
"@tanstack/react-query": "^5.79.0",
"@tanstack/react-table": "^8.21.3",
diff --git a/flowsint-app/src/renderer/public/icons/n8n.svg b/flowsint-app/src/renderer/public/icons/n8n.svg
new file mode 100644
index 0000000..82f0a6d
--- /dev/null
+++ b/flowsint-app/src/renderer/public/icons/n8n.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/flowsint-app/src/renderer/src/components/chat/floating-chat.tsx b/flowsint-app/src/renderer/src/components/chat/floating-chat.tsx
index c99f3e8..e8992eb 100644
--- a/flowsint-app/src/renderer/src/components/chat/floating-chat.tsx
+++ b/flowsint-app/src/renderer/src/components/chat/floating-chat.tsx
@@ -296,7 +296,7 @@ const ChatMessageComponent = ({ message }: { message: ChatMessage }) => {
)}>
{data.payload.name}
+ {data.value}%
+ {title} {value} {trend.period}Active Malware
+ Targeted Sectors
+
+ Your investigation graph will come to life here. Add nodes, discover relationships, + and uncover hidden patterns in your data. +
++ Add your first node to begin +
+No logs to display
+Events will appear here as they happen
{data.doc}
{scanner.doc}