mirror of
https://github.com/open-webui/open-webui.git
synced 2026-05-07 11:28:35 -05:00
fix: revert conditional skip that caused no_docs regression
The conditional skip optimization broke vector search because query_doc_with_hybrid_search requires collection_result for validation even when BM25 is disabled.\n\nChanges:\n- Always fetch collection data (required for validation)\n- Keep sync fetch optimization for single collections\n- Simplify task loop filter
This commit is contained in:
@@ -544,28 +544,20 @@ async def query_collection_with_hybrid_search(
|
||||
results = []
|
||||
error = False
|
||||
|
||||
# Optimization: Only fetch collection data if explicitly needed for BM25 or enrichment.
|
||||
# Fetching massive collections (e.g. 160k docs) takes significant time (~30s) and memory.
|
||||
should_fetch_data = hybrid_bm25_weight > 0 or enable_enriched_texts
|
||||
|
||||
if should_fetch_data:
|
||||
# Optimization: If only one collection, fetch synchronously to avoid asyncio.to_thread overhead
|
||||
# which can cause significant regression (e.g. 4s vs 28s) for large payloads.
|
||||
if len(collection_names) == 1:
|
||||
try:
|
||||
col_name = collection_names[0]
|
||||
log.debug(f"Fetching single collection synchronously: {col_name}")
|
||||
data = VECTOR_DB_CLIENT.get(collection_name=col_name)
|
||||
collection_results = {col_name: data}
|
||||
except Exception as e:
|
||||
log.exception(f"Failed to fetch collection {col_name}: {e}")
|
||||
collection_results = {col_name: None}
|
||||
else:
|
||||
# Fetch all collections in parallel to reduce latency for multiple collections
|
||||
collection_results = await fetch_collection_data(collection_names)
|
||||
# Always fetch collection data - it's required for validation in query_doc_with_hybrid_search.
|
||||
# Optimization: Use sync fetch for single collection to avoid asyncio overhead.
|
||||
if len(collection_names) == 1:
|
||||
try:
|
||||
col_name = collection_names[0]
|
||||
log.debug(f"Fetching single collection synchronously: {col_name}")
|
||||
data = VECTOR_DB_CLIENT.get(collection_name=col_name)
|
||||
collection_results = {col_name: data}
|
||||
except Exception as e:
|
||||
log.exception(f"Failed to fetch collection {col_name}: {e}")
|
||||
collection_results = {col_name: None}
|
||||
else:
|
||||
log.debug("Skipping collection fetch: BM25 disabled and no text enrichment needed")
|
||||
collection_results = {name: None for name in collection_names}
|
||||
# Fetch all collections in parallel to reduce latency for multiple collections
|
||||
collection_results = await fetch_collection_data(collection_names)
|
||||
|
||||
log.info(
|
||||
f"Starting hybrid search for {len(queries)} queries in {len(collection_names)} collections..."
|
||||
@@ -591,16 +583,13 @@ async def query_collection_with_hybrid_search(
|
||||
return None, e
|
||||
|
||||
# Prepare tasks for all collections and queries
|
||||
# If we skipped fetching (should_fetch_data=False), we still want to run the query (using vector search only).
|
||||
# If we tried to fetch and failed (result is None), we skip.
|
||||
tasks = []
|
||||
for collection_name in collection_names:
|
||||
data = collection_results.get(collection_name)
|
||||
if data is None and should_fetch_data:
|
||||
# We needed data but failed to fetch it, skip this collection
|
||||
continue
|
||||
for query in queries:
|
||||
tasks.append((collection_name, query))
|
||||
# Skip collections that failed to fetch (data is None)
|
||||
tasks = [
|
||||
(collection_name, query)
|
||||
for collection_name in collection_names
|
||||
if collection_results.get(collection_name) is not None
|
||||
for query in queries
|
||||
]
|
||||
|
||||
# Run all queries in parallel using asyncio.gather
|
||||
task_results = await asyncio.gather(
|
||||
|
||||
Reference in New Issue
Block a user