Merge remote-tracking branch 'upstream/dev' into playwright

# Conflicts: # backend/open_webui/config.py # backend/open_webui/main.py # backend/open_webui/retrieval/web/utils.py # backend/open_webui/routers/retrieval.py # backend/open_webui/utils/middleware.py # pyproject.toml
2026-05-03 18:59:38 -05:00 · 2025-02-14 20:48:22 -06:00
parent 40d4db97e6 74c8690cd1
commit 4da220c513
92 changed files with 2583 additions and 454 deletions
--- a/backend/open_webui/retrieval/web/utils.py
+++ b/backend/open_webui/retrieval/web/utils.py
@@ -2,11 +2,15 @@ import asyncio
 from datetime import datetime, time, timedelta
 import socket
 import ssl
+import aiohttp
+import asyncio
 import urllib.parse
 import certifi
 import validators
 from collections import defaultdict
 from typing import AsyncIterator, Dict, List, Optional, Union, Sequence, Iterator
+from typing import Any, AsyncIterator, Dict, Iterator, List, Sequence, Union
+

 from langchain_community.document_loaders import (
    WebBaseLoader,
@@ -230,6 +234,71 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader):
 class SafeWebBaseLoader(WebBaseLoader):
    """WebBaseLoader with enhanced error handling for URLs."""

+    def __init__(self, trust_env: bool = False, *args, **kwargs):
+        """Initialize SafeWebBaseLoader
+        Args:
+            trust_env (bool, optional): set to True if using proxy to make web requests, for example
+                using http(s)_proxy environment variables. Defaults to False.
+        """
+        super().__init__(*args, **kwargs)
+        self.trust_env = trust_env
+
+    async def _fetch(
+        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
+    ) -> str:
+        async with aiohttp.ClientSession(trust_env=self.trust_env) as session:
+            for i in range(retries):
+                try:
+                    kwargs: Dict = dict(
+                        headers=self.session.headers,
+                        cookies=self.session.cookies.get_dict(),
+                    )
+                    if not self.session.verify:
+                        kwargs["ssl"] = False
+
+                    async with session.get(
+                        url, **(self.requests_kwargs | kwargs)
+                    ) as response:
+                        if self.raise_for_status:
+                            response.raise_for_status()
+                        return await response.text()
+                except aiohttp.ClientConnectionError as e:
+                    if i == retries - 1:
+                        raise
+                    else:
+                        log.warning(
+                            f"Error fetching {url} with attempt "
+                            f"{i + 1}/{retries}: {e}. Retrying..."
+                        )
+                        await asyncio.sleep(cooldown * backoff**i)
+        raise ValueError("retry count exceeded")
+
+    def _unpack_fetch_results(
+        self, results: Any, urls: List[str], parser: Union[str, None] = None
+    ) -> List[Any]:
+        """Unpack fetch results into BeautifulSoup objects."""
+        from bs4 import BeautifulSoup
+
+        final_results = []
+        for i, result in enumerate(results):
+            url = urls[i]
+            if parser is None:
+                if url.endswith(".xml"):
+                    parser = "xml"
+                else:
+                    parser = self.default_parser
+                self._check_parser(parser)
+            final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))
+        return final_results
+
+    async def ascrape_all(
+        self, urls: List[str], parser: Union[str, None] = None
+    ) -> List[Any]:
+        """Async fetch all urls, then return soups for all results."""
+        results = await self.fetch_all(urls)
+        return self._unpack_fetch_results(results, urls, parser=parser)
+
+
    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path with error handling."""
        for path in self.web_paths:
@@ -245,6 +314,26 @@ class SafeWebBaseLoader(WebBaseLoader):
                # Log the error and continue with the next URL
                log.exception(e, "Error loading %s", path)

+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Async lazy load text from the url(s) in web_path."""
+        results = await self.ascrape_all(self.web_paths)
+        for path, soup in zip(self.web_paths, results):
+            text = soup.get_text(**self.bs_get_text_kwargs)
+            metadata = {"source": path}
+            if title := soup.find("title"):
+                metadata["title"] = title.get_text()
+            if description := soup.find("meta", attrs={"name": "description"}):
+                metadata["description"] = description.get(
+                    "content", "No description found."
+                )
+            if html := soup.find("html"):
+                metadata["language"] = html.get("lang", "No language found.")
+            yield Document(page_content=text, metadata=metadata)
+
+    async def aload(self) -> list[Document]:
+        """Load data into Document objects."""
+        return [document async for document in self.alazy_load()]
+
 RAG_WEB_LOADERS = defaultdict(lambda: SafeWebBaseLoader)
 RAG_WEB_LOADERS["playwright"] = SafePlaywrightURLLoader
 RAG_WEB_LOADERS["safe_web"] = SafeWebBaseLoader
@@ -253,16 +342,19 @@ def get_web_loader(
    urls: Union[str, Sequence[str]],
    verify_ssl: bool = True,
    requests_per_second: int = 2,
+    trust_env: bool = False,
 ):
    # Check if the URLs are valid
    safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)


    web_loader_args = {
+        web_path=safe_urls,
        "urls": safe_urls,
        "verify_ssl": verify_ssl,
        "requests_per_second": requests_per_second,
-        "continue_on_failure": True
+        "continue_on_failure": True,
+        trust_env=trust_env
    }

    if PLAYWRIGHT_WS_URI.value: