commit de3317e26b (parent fcf7208352)
Author: Timothy Jaeryang Baek
Date: 2026-03-17 17:58:01 -05:00
220 changed files with 17200 additions and 22836 deletions


@@ -67,16 +67,14 @@ def validate_url(url: Union[str, Sequence[str]]):
parsed_url = urllib.parse.urlparse(url)
# Protocol validation - only allow http/https
if parsed_url.scheme not in ["http", "https"]:
log.warning(
f"Blocked non-HTTP(S) protocol: {parsed_url.scheme} in URL: {url}"
)
if parsed_url.scheme not in ['http', 'https']:
log.warning(f'Blocked non-HTTP(S) protocol: {parsed_url.scheme} in URL: {url}')
raise ValueError(ERROR_MESSAGES.INVALID_URL)
# Blocklist check using unified filtering logic
if WEB_FETCH_FILTER_LIST:
if not is_string_allowed(url, WEB_FETCH_FILTER_LIST):
log.warning(f"URL blocked by filter list: {url}")
log.warning(f'URL blocked by filter list: {url}')
raise ValueError(ERROR_MESSAGES.INVALID_URL)
if not ENABLE_RAG_LOCAL_WEB_FETCH:
@@ -106,29 +104,29 @@ def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
if validate_url(u):
valid_urls.append(u)
except Exception as e:
log.debug(f"Invalid URL {u}: {str(e)}")
log.debug(f'Invalid URL {u}: {str(e)}')
continue
return valid_urls
def extract_metadata(soup, url):
metadata = {"source": url}
if title := soup.find("title"):
metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
metadata["description"] = description.get("content", "No description found.")
if html := soup.find("html"):
metadata["language"] = html.get("lang", "No language found.")
metadata = {'source': url}
if title := soup.find('title'):
metadata['title'] = title.get_text()
if description := soup.find('meta', attrs={'name': 'description'}):
metadata['description'] = description.get('content', 'No description found.')
if html := soup.find('html'):
metadata['language'] = html.get('lang', 'No language found.')
return metadata
def verify_ssl_cert(url: str) -> bool:
"""Verify SSL certificate for the given URL."""
if not url.startswith("https://"):
if not url.startswith('https://'):
return True
try:
hostname = url.split("://")[-1].split("/")[0]
hostname = url.split('://')[-1].split('/')[0]
context = ssl.create_default_context(cafile=certifi.where())
with context.wrap_socket(ssl.socket(), server_hostname=hostname) as s:
s.connect((hostname, 443))
@@ -136,7 +134,7 @@ def verify_ssl_cert(url: str) -> bool:
except ssl.SSLError:
return False
except Exception as e:
log.warning(f"SSL verification failed for {url}: {str(e)}")
log.warning(f'SSL verification failed for {url}: {str(e)}')
return False
@@ -168,14 +166,14 @@ class URLProcessingMixin:
async def _safe_process_url(self, url: str) -> bool:
"""Perform safety checks before processing a URL."""
if self.verify_ssl and not await self._verify_ssl_cert(url):
raise ValueError(f"SSL certificate verification failed for {url}")
raise ValueError(f'SSL certificate verification failed for {url}')
await self._wait_for_rate_limit()
return True
def _safe_process_url_sync(self, url: str) -> bool:
"""Synchronous version of safety checks."""
if self.verify_ssl and not verify_ssl_cert(url):
raise ValueError(f"SSL certificate verification failed for {url}")
raise ValueError(f'SSL certificate verification failed for {url}')
self._sync_wait_for_rate_limit()
return True
@@ -191,7 +189,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
api_key: Optional[str] = None,
api_url: Optional[str] = None,
timeout: Optional[int] = None,
mode: Literal["crawl", "scrape", "map"] = "scrape",
mode: Literal['crawl', 'scrape', 'map'] = 'scrape',
proxy: Optional[Dict[str, str]] = None,
params: Optional[Dict] = None,
):
@@ -216,15 +214,15 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
params: The parameters to pass to the Firecrawl API.
For more details, visit: https://docs.firecrawl.dev/sdks/python#batch-scrape
"""
proxy_server = proxy.get("server") if proxy else None
proxy_server = proxy.get('server') if proxy else None
if trust_env and not proxy_server:
env_proxies = urllib.request.getproxies()
env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
if env_proxy_server:
if proxy:
proxy["server"] = env_proxy_server
proxy['server'] = env_proxy_server
else:
proxy = {"server": env_proxy_server}
proxy = {'server': env_proxy_server}
self.web_paths = web_paths
self.verify_ssl = verify_ssl
self.requests_per_second = requests_per_second
@@ -240,7 +238,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
def lazy_load(self) -> Iterator[Document]:
"""Load documents using FireCrawl batch_scrape."""
log.debug(
"Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
'Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s',
len(self.web_paths),
self.mode,
self.params,
@@ -251,7 +249,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
firecrawl = FirecrawlApp(api_key=self.api_key, api_url=self.api_url)
result = firecrawl.batch_scrape(
self.web_paths,
formats=["markdown"],
formats=['markdown'],
skip_tls_verification=not self.verify_ssl,
ignore_invalid_urls=True,
remove_base64_images=True,
@@ -260,28 +258,26 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
**self.params,
)
if result.status != "completed":
raise RuntimeError(
f"FireCrawl batch scrape did not complete successfully. result: {result}"
)
if result.status != 'completed':
raise RuntimeError(f'FireCrawl batch scrape did not complete successfully. result: {result}')
for data in result.data:
metadata = data.metadata or {}
yield Document(
page_content=data.markdown or "",
metadata={"source": metadata.url or metadata.source_url or ""},
page_content=data.markdown or '',
metadata={'source': metadata.url or metadata.source_url or ''},
)
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error extracting content from URLs: {e}")
log.exception(f'Error extracting content from URLs: {e}')
else:
raise e
async def alazy_load(self):
"""Async version of lazy_load."""
log.debug(
"Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
'Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s',
len(self.web_paths),
self.mode,
self.params,
@@ -292,7 +288,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
firecrawl = FirecrawlApp(api_key=self.api_key, api_url=self.api_url)
result = firecrawl.batch_scrape(
self.web_paths,
formats=["markdown"],
formats=['markdown'],
skip_tls_verification=not self.verify_ssl,
ignore_invalid_urls=True,
remove_base64_images=True,
@@ -301,21 +297,19 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
**self.params,
)
if result.status != "completed":
raise RuntimeError(
f"FireCrawl batch scrape did not complete successfully. result: {result}"
)
if result.status != 'completed':
raise RuntimeError(f'FireCrawl batch scrape did not complete successfully. result: {result}')
for data in result.data:
metadata = data.metadata or {}
yield Document(
- page_content=data.markdown or "",
- metadata={"source": metadata.url or metadata.source_url or ""},
+ page_content=data.markdown or '',
+ metadata={'source': metadata.url or metadata.source_url or ''},
)
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error extracting content from URLs: {e}")
log.exception(f'Error extracting content from URLs: {e}')
else:
raise e
@@ -325,7 +319,7 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
self,
web_paths: Union[str, List[str]],
api_key: str,
extract_depth: Literal["basic", "advanced"] = "basic",
extract_depth: Literal['basic', 'advanced'] = 'basic',
continue_on_failure: bool = True,
requests_per_second: Optional[float] = None,
verify_ssl: bool = True,
@@ -345,15 +339,15 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
proxy: Optional proxy configuration.
"""
# Initialize proxy configuration if using environment variables
proxy_server = proxy.get("server") if proxy else None
proxy_server = proxy.get('server') if proxy else None
if trust_env and not proxy_server:
env_proxies = urllib.request.getproxies()
env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
if env_proxy_server:
if proxy:
proxy["server"] = env_proxy_server
proxy['server'] = env_proxy_server
else:
proxy = {"server": env_proxy_server}
proxy = {'server': env_proxy_server}
# Store parameters for creating TavilyLoader instances
self.web_paths = web_paths if isinstance(web_paths, list) else [web_paths]
@@ -376,14 +370,14 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
self._safe_process_url_sync(url)
valid_urls.append(url)
except Exception as e:
log.warning(f"SSL verification failed for {url}: {str(e)}")
log.warning(f'SSL verification failed for {url}: {str(e)}')
if not self.continue_on_failure:
raise e
if not valid_urls:
if self.continue_on_failure:
log.warning("No valid URLs to process after SSL verification")
log.warning('No valid URLs to process after SSL verification')
return
raise ValueError("No valid URLs to process after SSL verification")
raise ValueError('No valid URLs to process after SSL verification')
try:
loader = TavilyLoader(
urls=valid_urls,
@@ -394,7 +388,7 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
yield from loader.lazy_load()
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error extracting content from URLs: {e}")
log.exception(f'Error extracting content from URLs: {e}')
else:
raise e
@@ -406,15 +400,15 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
await self._safe_process_url(url)
valid_urls.append(url)
except Exception as e:
log.warning(f"SSL verification failed for {url}: {str(e)}")
log.warning(f'SSL verification failed for {url}: {str(e)}')
if not self.continue_on_failure:
raise e
if not valid_urls:
if self.continue_on_failure:
log.warning("No valid URLs to process after SSL verification")
log.warning('No valid URLs to process after SSL verification')
return
raise ValueError("No valid URLs to process after SSL verification")
raise ValueError('No valid URLs to process after SSL verification')
try:
loader = TavilyLoader(
@@ -427,7 +421,7 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
yield document
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error loading URLs: {e}")
log.exception(f'Error loading URLs: {e}')
else:
raise e
@@ -462,15 +456,15 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
):
"""Initialize with additional safety parameters and remote browser support."""
proxy_server = proxy.get("server") if proxy else None
proxy_server = proxy.get('server') if proxy else None
if trust_env and not proxy_server:
env_proxies = urllib.request.getproxies()
env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
if env_proxy_server:
if proxy:
proxy["server"] = env_proxy_server
proxy['server'] = env_proxy_server
else:
proxy = {"server": env_proxy_server}
proxy = {'server': env_proxy_server}
# We'll set headless to False if using playwright_ws_url since it's handled by the remote browser
super().__init__(
@@ -504,14 +498,14 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
page = browser.new_page()
response = page.goto(url, timeout=self.playwright_timeout)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
raise ValueError(f'page.goto() returned None for url {url}')
text = self.evaluator.evaluate(page, browser, response)
metadata = {"source": url}
metadata = {'source': url}
yield Document(page_content=text, metadata=metadata)
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error loading {url}: {e}")
log.exception(f'Error loading {url}: {e}')
continue
raise e
browser.close()
@@ -525,9 +519,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
if self.playwright_ws_url:
browser = await p.chromium.connect(self.playwright_ws_url)
else:
- browser = await p.chromium.launch(
- headless=self.headless, proxy=self.proxy
- )
+ browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
for url in self.urls:
try:
@@ -535,14 +527,14 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
page = await browser.new_page()
response = await page.goto(url, timeout=self.playwright_timeout)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
raise ValueError(f'page.goto() returned None for url {url}')
text = await self.evaluator.evaluate_async(page, browser, response)
metadata = {"source": url}
metadata = {'source': url}
yield Document(page_content=text, metadata=metadata)
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error loading {url}: {e}")
log.exception(f'Error loading {url}: {e}')
continue
raise e
await browser.close()
@@ -560,9 +552,7 @@ class SafeWebBaseLoader(WebBaseLoader):
super().__init__(*args, **kwargs)
self.trust_env = trust_env
- async def _fetch(
- self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
- ) -> str:
+ async def _fetch(self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5) -> str:
async with aiohttp.ClientSession(trust_env=self.trust_env) as session:
for i in range(retries):
try:
@@ -571,7 +561,7 @@ class SafeWebBaseLoader(WebBaseLoader):
cookies=self.session.cookies.get_dict(),
)
if not self.session.verify:
kwargs["ssl"] = False
kwargs['ssl'] = False
async with session.get(
url,
@@ -585,16 +575,11 @@ class SafeWebBaseLoader(WebBaseLoader):
if i == retries - 1:
raise
else:
- log.warning(
- f"Error fetching {url} with attempt "
- f"{i + 1}/{retries}: {e}. Retrying..."
- )
+ log.warning(f'Error fetching {url} with attempt {i + 1}/{retries}: {e}. Retrying...')
await asyncio.sleep(cooldown * backoff**i)
raise ValueError("retry count exceeded")
raise ValueError('retry count exceeded')
def _unpack_fetch_results(
self, results: Any, urls: List[str], parser: Union[str, None] = None
) -> List[Any]:
def _unpack_fetch_results(self, results: Any, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
"""Unpack fetch results into BeautifulSoup objects."""
from bs4 import BeautifulSoup
@@ -602,17 +587,15 @@ class SafeWebBaseLoader(WebBaseLoader):
for i, result in enumerate(results):
url = urls[i]
if parser is None:
if url.endswith(".xml"):
parser = "xml"
if url.endswith('.xml'):
parser = 'xml'
else:
parser = self.default_parser
self._check_parser(parser)
final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))
return final_results
- async def ascrape_all(
- self, urls: List[str], parser: Union[str, None] = None
- ) -> List[Any]:
+ async def ascrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
"""Async fetch all urls, then return soups for all results."""
results = await self.fetch_all(urls)
return self._unpack_fetch_results(results, urls, parser=parser)
@@ -630,22 +613,20 @@ class SafeWebBaseLoader(WebBaseLoader):
yield Document(page_content=text, metadata=metadata)
except Exception as e:
# Log the error and continue with the next URL
log.exception(f"Error loading {path}: {e}")
log.exception(f'Error loading {path}: {e}')
async def alazy_load(self) -> AsyncIterator[Document]:
"""Async lazy load text from the url(s) in web_path."""
results = await self.ascrape_all(self.web_paths)
for path, soup in zip(self.web_paths, results):
text = soup.get_text(**self.bs_get_text_kwargs)
metadata = {"source": path}
if title := soup.find("title"):
metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
metadata["description"] = description.get(
"content", "No description found."
)
if html := soup.find("html"):
metadata["language"] = html.get("lang", "No language found.")
metadata = {'source': path}
if title := soup.find('title'):
metadata['title'] = title.get_text()
if description := soup.find('meta', attrs={'name': 'description'}):
metadata['description'] = description.get('content', 'No description found.')
if html := soup.find('html'):
metadata['language'] = html.get('lang', 'No language found.')
yield Document(page_content=text, metadata=metadata)
async def aload(self) -> list[Document]:
@@ -663,18 +644,18 @@ def get_web_loader(
safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
if not safe_urls:
log.warning(f"All provided URLs were blocked or invalid: {urls}")
log.warning(f'All provided URLs were blocked or invalid: {urls}')
raise ValueError(ERROR_MESSAGES.INVALID_URL)
web_loader_args = {
"web_paths": safe_urls,
"verify_ssl": verify_ssl,
"requests_per_second": requests_per_second,
"continue_on_failure": True,
"trust_env": trust_env,
'web_paths': safe_urls,
'verify_ssl': verify_ssl,
'requests_per_second': requests_per_second,
'continue_on_failure': True,
'trust_env': trust_env,
}
if WEB_LOADER_ENGINE.value == "" or WEB_LOADER_ENGINE.value == "safe_web":
if WEB_LOADER_ENGINE.value == '' or WEB_LOADER_ENGINE.value == 'safe_web':
WebLoaderClass = SafeWebBaseLoader
request_kwargs = {}
@@ -685,42 +666,42 @@ def get_web_loader(
timeout_value = None
if timeout_value:
request_kwargs["timeout"] = timeout_value
request_kwargs['timeout'] = timeout_value
if request_kwargs:
web_loader_args["requests_kwargs"] = request_kwargs
web_loader_args['requests_kwargs'] = request_kwargs
if WEB_LOADER_ENGINE.value == "playwright":
if WEB_LOADER_ENGINE.value == 'playwright':
WebLoaderClass = SafePlaywrightURLLoader
web_loader_args["playwright_timeout"] = PLAYWRIGHT_TIMEOUT.value
web_loader_args['playwright_timeout'] = PLAYWRIGHT_TIMEOUT.value
if PLAYWRIGHT_WS_URL.value:
web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URL.value
web_loader_args['playwright_ws_url'] = PLAYWRIGHT_WS_URL.value
if WEB_LOADER_ENGINE.value == "firecrawl":
if WEB_LOADER_ENGINE.value == 'firecrawl':
WebLoaderClass = SafeFireCrawlLoader
web_loader_args["api_key"] = FIRECRAWL_API_KEY.value
web_loader_args["api_url"] = FIRECRAWL_API_BASE_URL.value
web_loader_args['api_key'] = FIRECRAWL_API_KEY.value
web_loader_args['api_url'] = FIRECRAWL_API_BASE_URL.value
if FIRECRAWL_TIMEOUT.value:
try:
web_loader_args["timeout"] = int(FIRECRAWL_TIMEOUT.value)
web_loader_args['timeout'] = int(FIRECRAWL_TIMEOUT.value)
except ValueError:
pass
if WEB_LOADER_ENGINE.value == "tavily":
if WEB_LOADER_ENGINE.value == 'tavily':
WebLoaderClass = SafeTavilyLoader
web_loader_args["api_key"] = TAVILY_API_KEY.value
web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value
web_loader_args['api_key'] = TAVILY_API_KEY.value
web_loader_args['extract_depth'] = TAVILY_EXTRACT_DEPTH.value
if WEB_LOADER_ENGINE.value == "external":
if WEB_LOADER_ENGINE.value == 'external':
WebLoaderClass = ExternalWebLoader
web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value
web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value
web_loader_args['external_url'] = EXTERNAL_WEB_LOADER_URL.value
web_loader_args['external_api_key'] = EXTERNAL_WEB_LOADER_API_KEY.value
if WebLoaderClass:
web_loader = WebLoaderClass(**web_loader_args)
log.debug(
"Using WEB_LOADER_ENGINE %s for %s URLs",
'Using WEB_LOADER_ENGINE %s for %s URLs',
web_loader.__class__.__name__,
len(safe_urls),
)
@@ -728,6 +709,6 @@ def get_web_loader(
return web_loader
else:
raise ValueError(
f"Invalid WEB_LOADER_ENGINE: {WEB_LOADER_ENGINE.value}. "
f'Invalid WEB_LOADER_ENGINE: {WEB_LOADER_ENGINE.value}. '
"Please set it to 'safe_web', 'playwright', 'firecrawl', or 'tavily'."
)
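
For reference, a minimal usage sketch of the get_web_loader factory touched by this diff. This is an assumption-level example: the module path, the example argument values, and the lazy_load() iteration (from the LangChain BaseLoader interface) are not shown in the diff itself.

# Hypothetical usage sketch; the import path and argument defaults are assumptions, not part of this commit.
from open_webui.retrieval.web.utils import get_web_loader

loader = get_web_loader(
    ['https://example.com/docs'],  # URLs are first checked by safe_validate_urls()
    verify_ssl=True,               # enables the verify_ssl_cert() checks shown above
    requests_per_second=2,         # rate limiting handled by RateLimitMixin
    trust_env=False,               # when True, proxy settings are read from the environment
)

# Every Safe* loader yields langchain Document objects with a 'source' in metadata.
for doc in loader.lazy_load():
    print(doc.metadata.get('source'), len(doc.page_content))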