commit de3317e26b (parent fcf7208352)
Author: Timothy Jaeryang Baek
Date: 2026-03-17 17:58:01 -05:00
220 changed files with 17200 additions and 22836 deletions


@@ -67,16 +67,14 @@ def validate_url(url: Union[str, Sequence[str]]):
parsed_url = urllib.parse.urlparse(url)
# Protocol validation - only allow http/https
if parsed_url.scheme not in ["http", "https"]:
log.warning(
f"Blocked non-HTTP(S) protocol: {parsed_url.scheme} in URL: {url}"
)
if parsed_url.scheme not in ['http', 'https']:
log.warning(f'Blocked non-HTTP(S) protocol: {parsed_url.scheme} in URL: {url}')
raise ValueError(ERROR_MESSAGES.INVALID_URL)
# Blocklist check using unified filtering logic
if WEB_FETCH_FILTER_LIST:
if not is_string_allowed(url, WEB_FETCH_FILTER_LIST):
log.warning(f"URL blocked by filter list: {url}")
log.warning(f'URL blocked by filter list: {url}')
raise ValueError(ERROR_MESSAGES.INVALID_URL)
if not ENABLE_RAG_LOCAL_WEB_FETCH:
@@ -106,29 +104,29 @@ def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
if validate_url(u):
valid_urls.append(u)
except Exception as e:
log.debug(f"Invalid URL {u}: {str(e)}")
log.debug(f'Invalid URL {u}: {str(e)}')
continue
return valid_urls
def extract_metadata(soup, url):
metadata = {"source": url}
if title := soup.find("title"):
metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
metadata["description"] = description.get("content", "No description found.")
if html := soup.find("html"):
metadata["language"] = html.get("lang", "No language found.")
metadata = {'source': url}
if title := soup.find('title'):
metadata['title'] = title.get_text()
if description := soup.find('meta', attrs={'name': 'description'}):
metadata['description'] = description.get('content', 'No description found.')
if html := soup.find('html'):
metadata['language'] = html.get('lang', 'No language found.')
return metadata
def verify_ssl_cert(url: str) -> bool:
"""Verify SSL certificate for the given URL."""
if not url.startswith("https://"):
if not url.startswith('https://'):
return True
try:
hostname = url.split("://")[-1].split("/")[0]
hostname = url.split('://')[-1].split('/')[0]
context = ssl.create_default_context(cafile=certifi.where())
with context.wrap_socket(ssl.socket(), server_hostname=hostname) as s:
s.connect((hostname, 443))
@@ -136,7 +134,7 @@ def verify_ssl_cert(url: str) -> bool:
except ssl.SSLError:
return False
except Exception as e:
log.warning(f"SSL verification failed for {url}: {str(e)}")
log.warning(f'SSL verification failed for {url}: {str(e)}')
return False
@@ -168,14 +166,14 @@ class URLProcessingMixin:
async def _safe_process_url(self, url: str) -> bool:
"""Perform safety checks before processing a URL."""
if self.verify_ssl and not await self._verify_ssl_cert(url):
raise ValueError(f"SSL certificate verification failed for {url}")
raise ValueError(f'SSL certificate verification failed for {url}')
await self._wait_for_rate_limit()
return True
def _safe_process_url_sync(self, url: str) -> bool:
"""Synchronous version of safety checks."""
if self.verify_ssl and not verify_ssl_cert(url):
raise ValueError(f"SSL certificate verification failed for {url}")
raise ValueError(f'SSL certificate verification failed for {url}')
self._sync_wait_for_rate_limit()
return True
@@ -191,7 +189,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
api_key: Optional[str] = None,
api_url: Optional[str] = None,
timeout: Optional[int] = None,
mode: Literal["crawl", "scrape", "map"] = "scrape",
mode: Literal['crawl', 'scrape', 'map'] = 'scrape',
proxy: Optional[Dict[str, str]] = None,
params: Optional[Dict] = None,
):
@@ -216,15 +214,15 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
params: The parameters to pass to the Firecrawl API.
For more details, visit: https://docs.firecrawl.dev/sdks/python#batch-scrape
"""
proxy_server = proxy.get("server") if proxy else None
proxy_server = proxy.get('server') if proxy else None
if trust_env and not proxy_server:
env_proxies = urllib.request.getproxies()
env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
if env_proxy_server:
if proxy:
proxy["server"] = env_proxy_server
proxy['server'] = env_proxy_server
else:
proxy = {"server": env_proxy_server}
proxy = {'server': env_proxy_server}
self.web_paths = web_paths
self.verify_ssl = verify_ssl
self.requests_per_second = requests_per_second
@@ -240,7 +238,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
def lazy_load(self) -> Iterator[Document]:
"""Load documents using FireCrawl batch_scrape."""
log.debug(
"Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
'Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s',
len(self.web_paths),
self.mode,
self.params,
@@ -251,7 +249,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
firecrawl = FirecrawlApp(api_key=self.api_key, api_url=self.api_url)
result = firecrawl.batch_scrape(
self.web_paths,
formats=["markdown"],
formats=['markdown'],
skip_tls_verification=not self.verify_ssl,
ignore_invalid_urls=True,
remove_base64_images=True,
@@ -260,28 +258,26 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
**self.params,
)
if result.status != "completed":
raise RuntimeError(
f"FireCrawl batch scrape did not complete successfully. result: {result}"
)
if result.status != 'completed':
raise RuntimeError(f'FireCrawl batch scrape did not complete successfully. result: {result}')
for data in result.data:
metadata = data.metadata or {}
yield Document(
page_content=data.markdown or "",
metadata={"source": metadata.url or metadata.source_url or ""},
page_content=data.markdown or '',
metadata={'source': metadata.url or metadata.source_url or ''},
)
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error extracting content from URLs: {e}")
log.exception(f'Error extracting content from URLs: {e}')
else:
raise e
async def alazy_load(self):
"""Async version of lazy_load."""
log.debug(
"Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
'Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s',
len(self.web_paths),
self.mode,
self.params,
@@ -292,7 +288,7 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
firecrawl = FirecrawlApp(api_key=self.api_key, api_url=self.api_url)
result = firecrawl.batch_scrape(
self.web_paths,
formats=["markdown"],
formats=['markdown'],
skip_tls_verification=not self.verify_ssl,
ignore_invalid_urls=True,
remove_base64_images=True,
@@ -301,21 +297,19 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
**self.params,
)
if result.status != "completed":
raise RuntimeError(
f"FireCrawl batch scrape did not complete successfully. result: {result}"
)
if result.status != 'completed':
raise RuntimeError(f'FireCrawl batch scrape did not complete successfully. result: {result}')
for data in result.data:
metadata = data.metadata or {}
yield Document(
- page_content=data.markdown or "",
- metadata={"source": metadata.url or metadata.source_url or ""},
+ page_content=data.markdown or '',
+ metadata={'source': metadata.url or metadata.source_url or ''},
)
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error extracting content from URLs: {e}")
log.exception(f'Error extracting content from URLs: {e}')
else:
raise e
@@ -325,7 +319,7 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
self,
web_paths: Union[str, List[str]],
api_key: str,
extract_depth: Literal["basic", "advanced"] = "basic",
extract_depth: Literal['basic', 'advanced'] = 'basic',
continue_on_failure: bool = True,
requests_per_second: Optional[float] = None,
verify_ssl: bool = True,
@@ -345,15 +339,15 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
proxy: Optional proxy configuration.
"""
# Initialize proxy configuration if using environment variables
proxy_server = proxy.get("server") if proxy else None
proxy_server = proxy.get('server') if proxy else None
if trust_env and not proxy_server:
env_proxies = urllib.request.getproxies()
env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
if env_proxy_server:
if proxy:
proxy["server"] = env_proxy_server
proxy['server'] = env_proxy_server
else:
proxy = {"server": env_proxy_server}
proxy = {'server': env_proxy_server}
# Store parameters for creating TavilyLoader instances
self.web_paths = web_paths if isinstance(web_paths, list) else [web_paths]
@@ -376,14 +370,14 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
self._safe_process_url_sync(url)
valid_urls.append(url)
except Exception as e:
log.warning(f"SSL verification failed for {url}: {str(e)}")
log.warning(f'SSL verification failed for {url}: {str(e)}')
if not self.continue_on_failure:
raise e
if not valid_urls:
if self.continue_on_failure:
log.warning("No valid URLs to process after SSL verification")
log.warning('No valid URLs to process after SSL verification')
return
raise ValueError("No valid URLs to process after SSL verification")
raise ValueError('No valid URLs to process after SSL verification')
try:
loader = TavilyLoader(
urls=valid_urls,
@@ -394,7 +388,7 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
yield from loader.lazy_load()
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error extracting content from URLs: {e}")
log.exception(f'Error extracting content from URLs: {e}')
else:
raise e
@@ -406,15 +400,15 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
await self._safe_process_url(url)
valid_urls.append(url)
except Exception as e:
log.warning(f"SSL verification failed for {url}: {str(e)}")
log.warning(f'SSL verification failed for {url}: {str(e)}')
if not self.continue_on_failure:
raise e
if not valid_urls:
if self.continue_on_failure:
log.warning("No valid URLs to process after SSL verification")
log.warning('No valid URLs to process after SSL verification')
return
raise ValueError("No valid URLs to process after SSL verification")
raise ValueError('No valid URLs to process after SSL verification')
try:
loader = TavilyLoader(
@@ -427,7 +421,7 @@ class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
yield document
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error loading URLs: {e}")
log.exception(f'Error loading URLs: {e}')
else:
raise e
@@ -462,15 +456,15 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
):
"""Initialize with additional safety parameters and remote browser support."""
proxy_server = proxy.get("server") if proxy else None
proxy_server = proxy.get('server') if proxy else None
if trust_env and not proxy_server:
env_proxies = urllib.request.getproxies()
env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
if env_proxy_server:
if proxy:
proxy["server"] = env_proxy_server
proxy['server'] = env_proxy_server
else:
proxy = {"server": env_proxy_server}
proxy = {'server': env_proxy_server}
# We'll set headless to False if using playwright_ws_url since it's handled by the remote browser
super().__init__(
@@ -504,14 +498,14 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
page = browser.new_page()
response = page.goto(url, timeout=self.playwright_timeout)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
raise ValueError(f'page.goto() returned None for url {url}')
text = self.evaluator.evaluate(page, browser, response)
metadata = {"source": url}
metadata = {'source': url}
yield Document(page_content=text, metadata=metadata)
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error loading {url}: {e}")
log.exception(f'Error loading {url}: {e}')
continue
raise e
browser.close()
@@ -525,9 +519,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
if self.playwright_ws_url:
browser = await p.chromium.connect(self.playwright_ws_url)
else:
- browser = await p.chromium.launch(
- headless=self.headless, proxy=self.proxy
- )
+ browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
for url in self.urls:
try:
@@ -535,14 +527,14 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
page = await browser.new_page()
response = await page.goto(url, timeout=self.playwright_timeout)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
raise ValueError(f'page.goto() returned None for url {url}')
text = await self.evaluator.evaluate_async(page, browser, response)
metadata = {"source": url}
metadata = {'source': url}
yield Document(page_content=text, metadata=metadata)
except Exception as e:
if self.continue_on_failure:
log.exception(f"Error loading {url}: {e}")
log.exception(f'Error loading {url}: {e}')
continue
raise e
await browser.close()
@@ -560,9 +552,7 @@ class SafeWebBaseLoader(WebBaseLoader):
super().__init__(*args, **kwargs)
self.trust_env = trust_env
- async def _fetch(
- self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
- ) -> str:
+ async def _fetch(self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5) -> str:
async with aiohttp.ClientSession(trust_env=self.trust_env) as session:
for i in range(retries):
try:
@@ -571,7 +561,7 @@ class SafeWebBaseLoader(WebBaseLoader):
cookies=self.session.cookies.get_dict(),
)
if not self.session.verify:
kwargs["ssl"] = False
kwargs['ssl'] = False
async with session.get(
url,
@@ -585,16 +575,11 @@ class SafeWebBaseLoader(WebBaseLoader):
if i == retries - 1:
raise
else:
- log.warning(
- f"Error fetching {url} with attempt "
- f"{i + 1}/{retries}: {e}. Retrying..."
- )
+ log.warning(f'Error fetching {url} with attempt {i + 1}/{retries}: {e}. Retrying...')
await asyncio.sleep(cooldown * backoff**i)
raise ValueError("retry count exceeded")
raise ValueError('retry count exceeded')
def _unpack_fetch_results(
self, results: Any, urls: List[str], parser: Union[str, None] = None
) -> List[Any]:
def _unpack_fetch_results(self, results: Any, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
"""Unpack fetch results into BeautifulSoup objects."""
from bs4 import BeautifulSoup
@@ -602,17 +587,15 @@ class SafeWebBaseLoader(WebBaseLoader):
for i, result in enumerate(results):
url = urls[i]
if parser is None:
if url.endswith(".xml"):
parser = "xml"
if url.endswith('.xml'):
parser = 'xml'
else:
parser = self.default_parser
self._check_parser(parser)
final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))
return final_results
- async def ascrape_all(
- self, urls: List[str], parser: Union[str, None] = None
- ) -> List[Any]:
+ async def ascrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
"""Async fetch all urls, then return soups for all results."""
results = await self.fetch_all(urls)
return self._unpack_fetch_results(results, urls, parser=parser)
@@ -630,22 +613,20 @@ class SafeWebBaseLoader(WebBaseLoader):
yield Document(page_content=text, metadata=metadata)
except Exception as e:
# Log the error and continue with the next URL
log.exception(f"Error loading {path}: {e}")
log.exception(f'Error loading {path}: {e}')
async def alazy_load(self) -> AsyncIterator[Document]:
"""Async lazy load text from the url(s) in web_path."""
results = await self.ascrape_all(self.web_paths)
for path, soup in zip(self.web_paths, results):
text = soup.get_text(**self.bs_get_text_kwargs)
metadata = {"source": path}
if title := soup.find("title"):
metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
metadata["description"] = description.get(
"content", "No description found."
)
if html := soup.find("html"):
metadata["language"] = html.get("lang", "No language found.")
metadata = {'source': path}
if title := soup.find('title'):
metadata['title'] = title.get_text()
if description := soup.find('meta', attrs={'name': 'description'}):
metadata['description'] = description.get('content', 'No description found.')
if html := soup.find('html'):
metadata['language'] = html.get('lang', 'No language found.')
yield Document(page_content=text, metadata=metadata)
async def aload(self) -> list[Document]:
@@ -663,18 +644,18 @@ def get_web_loader(
safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
if not safe_urls:
log.warning(f"All provided URLs were blocked or invalid: {urls}")
log.warning(f'All provided URLs were blocked or invalid: {urls}')
raise ValueError(ERROR_MESSAGES.INVALID_URL)
web_loader_args = {
"web_paths": safe_urls,
"verify_ssl": verify_ssl,
"requests_per_second": requests_per_second,
"continue_on_failure": True,
"trust_env": trust_env,
'web_paths': safe_urls,
'verify_ssl': verify_ssl,
'requests_per_second': requests_per_second,
'continue_on_failure': True,
'trust_env': trust_env,
}
if WEB_LOADER_ENGINE.value == "" or WEB_LOADER_ENGINE.value == "safe_web":
if WEB_LOADER_ENGINE.value == '' or WEB_LOADER_ENGINE.value == 'safe_web':
WebLoaderClass = SafeWebBaseLoader
request_kwargs = {}
@@ -685,42 +666,42 @@ def get_web_loader(
timeout_value = None
if timeout_value:
request_kwargs["timeout"] = timeout_value
request_kwargs['timeout'] = timeout_value
if request_kwargs:
web_loader_args["requests_kwargs"] = request_kwargs
web_loader_args['requests_kwargs'] = request_kwargs
if WEB_LOADER_ENGINE.value == "playwright":
if WEB_LOADER_ENGINE.value == 'playwright':
WebLoaderClass = SafePlaywrightURLLoader
web_loader_args["playwright_timeout"] = PLAYWRIGHT_TIMEOUT.value
web_loader_args['playwright_timeout'] = PLAYWRIGHT_TIMEOUT.value
if PLAYWRIGHT_WS_URL.value:
web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URL.value
web_loader_args['playwright_ws_url'] = PLAYWRIGHT_WS_URL.value
if WEB_LOADER_ENGINE.value == "firecrawl":
if WEB_LOADER_ENGINE.value == 'firecrawl':
WebLoaderClass = SafeFireCrawlLoader
web_loader_args["api_key"] = FIRECRAWL_API_KEY.value
web_loader_args["api_url"] = FIRECRAWL_API_BASE_URL.value
web_loader_args['api_key'] = FIRECRAWL_API_KEY.value
web_loader_args['api_url'] = FIRECRAWL_API_BASE_URL.value
if FIRECRAWL_TIMEOUT.value:
try:
web_loader_args["timeout"] = int(FIRECRAWL_TIMEOUT.value)
web_loader_args['timeout'] = int(FIRECRAWL_TIMEOUT.value)
except ValueError:
pass
if WEB_LOADER_ENGINE.value == "tavily":
if WEB_LOADER_ENGINE.value == 'tavily':
WebLoaderClass = SafeTavilyLoader
web_loader_args["api_key"] = TAVILY_API_KEY.value
web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value
web_loader_args['api_key'] = TAVILY_API_KEY.value
web_loader_args['extract_depth'] = TAVILY_EXTRACT_DEPTH.value
if WEB_LOADER_ENGINE.value == "external":
if WEB_LOADER_ENGINE.value == 'external':
WebLoaderClass = ExternalWebLoader
web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value
web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value
web_loader_args['external_url'] = EXTERNAL_WEB_LOADER_URL.value
web_loader_args['external_api_key'] = EXTERNAL_WEB_LOADER_API_KEY.value
if WebLoaderClass:
web_loader = WebLoaderClass(**web_loader_args)
log.debug(
"Using WEB_LOADER_ENGINE %s for %s URLs",
'Using WEB_LOADER_ENGINE %s for %s URLs',
web_loader.__class__.__name__,
len(safe_urls),
)
@@ -728,6 +709,6 @@ def get_web_loader(
return web_loader
else:
raise ValueError(
f"Invalid WEB_LOADER_ENGINE: {WEB_LOADER_ENGINE.value}. "
f'Invalid WEB_LOADER_ENGINE: {WEB_LOADER_ENGINE.value}. '
"Please set it to 'safe_web', 'playwright', 'firecrawl', or 'tavily'."
)
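
For reference, a minimal usage sketch of the get_web_loader factory touched by this diff. This is an assumption-level example: the module path, the example argument values, and the lazy_load() iteration (from the LangChain BaseLoader interface) are not shown in the diff itself.

# Hypothetical usage sketch; the import path and argument defaults are assumptions, not part of this commit.
from open_webui.retrieval.web.utils import get_web_loader

loader = get_web_loader(
    ['https://example.com/docs'],  # URLs are first checked by safe_validate_urls()
    verify_ssl=True,               # enables the verify_ssl_cert() checks shown above
    requests_per_second=2,         # rate limiting handled by RateLimitMixin
    trust_env=False,               # when True, proxy settings are read from the environment
)

# Every Safe* loader yields langchain Document objects with a 'source' in metadata.
for doc in loader.lazy_load():
    print(doc.metadata.get('source'), len(doc.page_content))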