This commit is contained in:
Timothy Jaeryang Baek
2026-03-17 17:58:01 -05:00
parent fcf7208352
commit de3317e26b
220 changed files with 17200 additions and 22836 deletions

View File

@@ -22,37 +22,35 @@ class MinerULoader:
def __init__(
self,
file_path: str,
api_mode: str = "local",
api_url: str = "http://localhost:8000",
api_key: str = "",
api_mode: str = 'local',
api_url: str = 'http://localhost:8000',
api_key: str = '',
params: dict = None,
timeout: Optional[int] = 300,
):
self.file_path = file_path
self.api_mode = api_mode.lower()
self.api_url = api_url.rstrip("/")
self.api_url = api_url.rstrip('/')
self.api_key = api_key
self.timeout = timeout
# Parse params dict with defaults
self.params = params or {}
self.enable_ocr = params.get("enable_ocr", False)
self.enable_formula = params.get("enable_formula", True)
self.enable_table = params.get("enable_table", True)
self.language = params.get("language", "en")
self.model_version = params.get("model_version", "pipeline")
self.enable_ocr = params.get('enable_ocr', False)
self.enable_formula = params.get('enable_formula', True)
self.enable_table = params.get('enable_table', True)
self.language = params.get('language', 'en')
self.model_version = params.get('model_version', 'pipeline')
self.page_ranges = self.params.pop("page_ranges", "")
self.page_ranges = self.params.pop('page_ranges', '')
# Validate API mode
if self.api_mode not in ["local", "cloud"]:
raise ValueError(
f"Invalid API mode: {self.api_mode}. Must be 'local' or 'cloud'"
)
if self.api_mode not in ['local', 'cloud']:
raise ValueError(f"Invalid API mode: {self.api_mode}. Must be 'local' or 'cloud'")
# Validate Cloud API requirements
if self.api_mode == "cloud" and not self.api_key:
raise ValueError("API key is required for Cloud API mode")
if self.api_mode == 'cloud' and not self.api_key:
raise ValueError('API key is required for Cloud API mode')
def load(self) -> List[Document]:
"""
@@ -60,12 +58,12 @@ class MinerULoader:
Routes to Cloud or Local API based on api_mode.
"""
try:
if self.api_mode == "cloud":
if self.api_mode == 'cloud':
return self._load_cloud_api()
else:
return self._load_local_api()
except Exception as e:
log.error(f"Error loading document with MinerU: {e}")
log.error(f'Error loading document with MinerU: {e}')
raise
def _load_local_api(self) -> List[Document]:
@@ -73,14 +71,14 @@ class MinerULoader:
Load document using Local API (synchronous).
Posts file to /file_parse endpoint and gets immediate response.
"""
log.info(f"Using MinerU Local API at {self.api_url}")
log.info(f'Using MinerU Local API at {self.api_url}')
filename = os.path.basename(self.file_path)
# Build form data for Local API
form_data = {
**self.params,
"return_md": "true",
'return_md': 'true',
}
# Page ranges (Local API uses start_page_id and end_page_id)
@@ -89,18 +87,18 @@ class MinerULoader:
# Full page range parsing would require parsing the string
log.warning(
f"Page ranges '{self.page_ranges}' specified but Local API uses different format. "
"Consider using start_page_id/end_page_id parameters if needed."
'Consider using start_page_id/end_page_id parameters if needed.'
)
try:
with open(self.file_path, "rb") as f:
files = {"files": (filename, f, "application/octet-stream")}
with open(self.file_path, 'rb') as f:
files = {'files': (filename, f, 'application/octet-stream')}
log.info(f"Sending file to MinerU Local API: {filename}")
log.debug(f"Local API parameters: {form_data}")
log.info(f'Sending file to MinerU Local API: {filename}')
log.debug(f'Local API parameters: {form_data}')
response = requests.post(
f"{self.api_url}/file_parse",
f'{self.api_url}/file_parse',
data=form_data,
files=files,
timeout=self.timeout,
@@ -108,27 +106,25 @@ class MinerULoader:
response.raise_for_status()
except FileNotFoundError:
raise HTTPException(
status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}"
)
raise HTTPException(status.HTTP_404_NOT_FOUND, detail=f'File not found: {self.file_path}')
except requests.Timeout:
raise HTTPException(
status.HTTP_504_GATEWAY_TIMEOUT,
detail="MinerU Local API request timed out",
detail='MinerU Local API request timed out',
)
except requests.HTTPError as e:
error_detail = f"MinerU Local API request failed: {e}"
error_detail = f'MinerU Local API request failed: {e}'
if e.response is not None:
try:
error_data = e.response.json()
error_detail += f" - {error_data}"
error_detail += f' - {error_data}'
except Exception:
error_detail += f" - {e.response.text}"
error_detail += f' - {e.response.text}'
raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail)
except Exception as e:
raise HTTPException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error calling MinerU Local API: {str(e)}",
detail=f'Error calling MinerU Local API: {str(e)}',
)
# Parse response
@@ -137,41 +133,41 @@ class MinerULoader:
except ValueError as e:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=f"Invalid JSON response from MinerU Local API: {e}",
detail=f'Invalid JSON response from MinerU Local API: {e}',
)
# Extract markdown content from response
if "results" not in result:
if 'results' not in result:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail="MinerU Local API response missing 'results' field",
)
results = result["results"]
results = result['results']
if not results:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail="MinerU returned empty results",
detail='MinerU returned empty results',
)
# Get the first (and typically only) result
file_result = list(results.values())[0]
markdown_content = file_result.get("md_content", "")
markdown_content = file_result.get('md_content', '')
if not markdown_content:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail="MinerU returned empty markdown content",
detail='MinerU returned empty markdown content',
)
log.info(f"Successfully parsed document with MinerU Local API: {filename}")
log.info(f'Successfully parsed document with MinerU Local API: {filename}')
# Create metadata
metadata = {
"source": filename,
"api_mode": "local",
"backend": result.get("backend", "unknown"),
"version": result.get("version", "unknown"),
'source': filename,
'api_mode': 'local',
'backend': result.get('backend', 'unknown'),
'version': result.get('version', 'unknown'),
}
return [Document(page_content=markdown_content, metadata=metadata)]
@@ -181,7 +177,7 @@ class MinerULoader:
Load document using Cloud API (asynchronous).
Uses batch upload endpoint to avoid need for public file URLs.
"""
log.info(f"Using MinerU Cloud API at {self.api_url}")
log.info(f'Using MinerU Cloud API at {self.api_url}')
filename = os.path.basename(self.file_path)
@@ -195,17 +191,15 @@ class MinerULoader:
result = self._poll_batch_status(batch_id, filename)
# Step 4: Download and extract markdown from ZIP
markdown_content = self._download_and_extract_zip(
result["full_zip_url"], filename
)
markdown_content = self._download_and_extract_zip(result['full_zip_url'], filename)
log.info(f"Successfully parsed document with MinerU Cloud API: {filename}")
log.info(f'Successfully parsed document with MinerU Cloud API: {filename}')
# Create metadata
metadata = {
"source": filename,
"api_mode": "cloud",
"batch_id": batch_id,
'source': filename,
'api_mode': 'cloud',
'batch_id': batch_id,
}
return [Document(page_content=markdown_content, metadata=metadata)]
@@ -216,49 +210,49 @@ class MinerULoader:
Returns (batch_id, upload_url).
"""
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
'Authorization': f'Bearer {self.api_key}',
'Content-Type': 'application/json',
}
# Build request body
request_body = {
**self.params,
"files": [
'files': [
{
"name": filename,
"is_ocr": self.enable_ocr,
'name': filename,
'is_ocr': self.enable_ocr,
}
],
}
# Add page ranges if specified
if self.page_ranges:
request_body["files"][0]["page_ranges"] = self.page_ranges
request_body['files'][0]['page_ranges'] = self.page_ranges
log.info(f"Requesting upload URL for: {filename}")
log.debug(f"Cloud API request body: {request_body}")
log.info(f'Requesting upload URL for: {filename}')
log.debug(f'Cloud API request body: {request_body}')
try:
response = requests.post(
f"{self.api_url}/file-urls/batch",
f'{self.api_url}/file-urls/batch',
headers=headers,
json=request_body,
timeout=30,
)
response.raise_for_status()
except requests.HTTPError as e:
error_detail = f"Failed to request upload URL: {e}"
error_detail = f'Failed to request upload URL: {e}'
if e.response is not None:
try:
error_data = e.response.json()
error_detail += f" - {error_data.get('msg', error_data)}"
error_detail += f' - {error_data.get("msg", error_data)}'
except Exception:
error_detail += f" - {e.response.text}"
error_detail += f' - {e.response.text}'
raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail)
except Exception as e:
raise HTTPException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error requesting upload URL: {str(e)}",
detail=f'Error requesting upload URL: {str(e)}',
)
try:
@@ -266,28 +260,28 @@ class MinerULoader:
except ValueError as e:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=f"Invalid JSON response: {e}",
detail=f'Invalid JSON response: {e}',
)
# Check for API error response
if result.get("code") != 0:
if result.get('code') != 0:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"MinerU Cloud API error: {result.get('msg', 'Unknown error')}",
detail=f'MinerU Cloud API error: {result.get("msg", "Unknown error")}',
)
data = result.get("data", {})
batch_id = data.get("batch_id")
file_urls = data.get("file_urls", [])
data = result.get('data', {})
batch_id = data.get('batch_id')
file_urls = data.get('file_urls', [])
if not batch_id or not file_urls:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail="MinerU Cloud API response missing batch_id or file_urls",
detail='MinerU Cloud API response missing batch_id or file_urls',
)
upload_url = file_urls[0]
log.info(f"Received upload URL for batch: {batch_id}")
log.info(f'Received upload URL for batch: {batch_id}')
return batch_id, upload_url
@@ -295,10 +289,10 @@ class MinerULoader:
"""
Upload file to presigned URL (no authentication needed).
"""
log.info(f"Uploading file to presigned URL")
log.info(f'Uploading file to presigned URL')
try:
with open(self.file_path, "rb") as f:
with open(self.file_path, 'rb') as f:
response = requests.put(
upload_url,
data=f,
@@ -306,26 +300,24 @@ class MinerULoader:
)
response.raise_for_status()
except FileNotFoundError:
raise HTTPException(
status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}"
)
raise HTTPException(status.HTTP_404_NOT_FOUND, detail=f'File not found: {self.file_path}')
except requests.Timeout:
raise HTTPException(
status.HTTP_504_GATEWAY_TIMEOUT,
detail="File upload to presigned URL timed out",
detail='File upload to presigned URL timed out',
)
except requests.HTTPError as e:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Failed to upload file to presigned URL: {e}",
detail=f'Failed to upload file to presigned URL: {e}',
)
except Exception as e:
raise HTTPException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error uploading file: {str(e)}",
detail=f'Error uploading file: {str(e)}',
)
log.info("File uploaded successfully")
log.info('File uploaded successfully')
def _poll_batch_status(self, batch_id: str, filename: str) -> dict:
"""
@@ -333,35 +325,35 @@ class MinerULoader:
Returns the result dict for the file.
"""
headers = {
"Authorization": f"Bearer {self.api_key}",
'Authorization': f'Bearer {self.api_key}',
}
max_iterations = 300 # 10 minutes max (2 seconds per iteration)
poll_interval = 2 # seconds
log.info(f"Polling batch status: {batch_id}")
log.info(f'Polling batch status: {batch_id}')
for iteration in range(max_iterations):
try:
response = requests.get(
f"{self.api_url}/extract-results/batch/{batch_id}",
f'{self.api_url}/extract-results/batch/{batch_id}',
headers=headers,
timeout=30,
)
response.raise_for_status()
except requests.HTTPError as e:
error_detail = f"Failed to poll batch status: {e}"
error_detail = f'Failed to poll batch status: {e}'
if e.response is not None:
try:
error_data = e.response.json()
error_detail += f" - {error_data.get('msg', error_data)}"
error_detail += f' - {error_data.get("msg", error_data)}'
except Exception:
error_detail += f" - {e.response.text}"
error_detail += f' - {e.response.text}'
raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail)
except Exception as e:
raise HTTPException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error polling batch status: {str(e)}",
detail=f'Error polling batch status: {str(e)}',
)
try:
@@ -369,58 +361,56 @@ class MinerULoader:
except ValueError as e:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=f"Invalid JSON response while polling: {e}",
detail=f'Invalid JSON response while polling: {e}',
)
# Check for API error response
if result.get("code") != 0:
if result.get('code') != 0:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"MinerU Cloud API error: {result.get('msg', 'Unknown error')}",
detail=f'MinerU Cloud API error: {result.get("msg", "Unknown error")}',
)
data = result.get("data", {})
extract_result = data.get("extract_result", [])
data = result.get('data', {})
extract_result = data.get('extract_result', [])
# Find our file in the batch results
file_result = None
for item in extract_result:
if item.get("file_name") == filename:
if item.get('file_name') == filename:
file_result = item
break
if not file_result:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=f"File {filename} not found in batch results",
detail=f'File {filename} not found in batch results',
)
state = file_result.get("state")
state = file_result.get('state')
if state == "done":
log.info(f"Processing complete for {filename}")
if state == 'done':
log.info(f'Processing complete for {filename}')
return file_result
elif state == "failed":
error_msg = file_result.get("err_msg", "Unknown error")
elif state == 'failed':
error_msg = file_result.get('err_msg', 'Unknown error')
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"MinerU processing failed: {error_msg}",
detail=f'MinerU processing failed: {error_msg}',
)
elif state in ["waiting-file", "pending", "running", "converting"]:
elif state in ['waiting-file', 'pending', 'running', 'converting']:
# Still processing
if iteration % 10 == 0: # Log every 20 seconds
log.info(
f"Processing status: {state} (iteration {iteration + 1}/{max_iterations})"
)
log.info(f'Processing status: {state} (iteration {iteration + 1}/{max_iterations})')
time.sleep(poll_interval)
else:
log.warning(f"Unknown state: {state}")
log.warning(f'Unknown state: {state}')
time.sleep(poll_interval)
# Timeout
raise HTTPException(
status.HTTP_504_GATEWAY_TIMEOUT,
detail="MinerU processing timed out after 10 minutes",
detail='MinerU processing timed out after 10 minutes',
)
def _download_and_extract_zip(self, zip_url: str, filename: str) -> str:
@@ -428,7 +418,7 @@ class MinerULoader:
Download ZIP file from CDN and extract markdown content.
Returns the markdown content as a string.
"""
log.info(f"Downloading results from: {zip_url}")
log.info(f'Downloading results from: {zip_url}')
try:
response = requests.get(zip_url, timeout=60)
@@ -436,23 +426,23 @@ class MinerULoader:
except requests.HTTPError as e:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Failed to download results ZIP: {e}",
detail=f'Failed to download results ZIP: {e}',
)
except Exception as e:
raise HTTPException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error downloading results: {str(e)}",
detail=f'Error downloading results: {str(e)}',
)
# Save ZIP to temporary file and extract
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_zip:
with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as tmp_zip:
tmp_zip.write(response.content)
tmp_zip_path = tmp_zip.name
with tempfile.TemporaryDirectory() as tmp_dir:
# Extract ZIP
with zipfile.ZipFile(tmp_zip_path, "r") as zip_ref:
with zipfile.ZipFile(tmp_zip_path, 'r') as zip_ref:
zip_ref.extractall(tmp_dir)
# Find markdown file - search recursively for any .md file
@@ -466,33 +456,27 @@ class MinerULoader:
full_path = os.path.join(root, file)
all_files.append(full_path)
# Look for any .md file
if file.endswith(".md"):
if file.endswith('.md'):
found_md_path = full_path
log.info(f"Found markdown file at: {full_path}")
log.info(f'Found markdown file at: {full_path}')
try:
with open(full_path, "r", encoding="utf-8") as f:
with open(full_path, 'r', encoding='utf-8') as f:
markdown_content = f.read()
if (
markdown_content
): # Use the first non-empty markdown file
if markdown_content: # Use the first non-empty markdown file
break
except Exception as e:
log.warning(f"Failed to read {full_path}: {e}")
log.warning(f'Failed to read {full_path}: {e}')
if markdown_content:
break
if markdown_content is None:
log.error(f"Available files in ZIP: {all_files}")
log.error(f'Available files in ZIP: {all_files}')
# Try to provide more helpful error message
md_files = [f for f in all_files if f.endswith(".md")]
md_files = [f for f in all_files if f.endswith('.md')]
if md_files:
error_msg = (
f"Found .md files but couldn't read them: {md_files}"
)
error_msg = f"Found .md files but couldn't read them: {md_files}"
else:
error_msg = (
f"No .md files found in ZIP. Available files: {all_files}"
)
error_msg = f'No .md files found in ZIP. Available files: {all_files}'
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=error_msg,
@@ -504,21 +488,19 @@ class MinerULoader:
except zipfile.BadZipFile as e:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=f"Invalid ZIP file received: {e}",
detail=f'Invalid ZIP file received: {e}',
)
except Exception as e:
raise HTTPException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error extracting ZIP: {str(e)}",
detail=f'Error extracting ZIP: {str(e)}',
)
if not markdown_content:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail="Extracted markdown content is empty",
detail='Extracted markdown content is empty',
)
log.info(
f"Successfully extracted markdown content ({len(markdown_content)} characters)"
)
log.info(f'Successfully extracted markdown content ({len(markdown_content)} characters)')
return markdown_content