138 lines
4.6 KiB
Python
138 lines
4.6 KiB
Python
|
|
"""文档解析:PDF / DOCX / TXT / MD / CSV / XLSX"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import io
|
|||
|
|
import base64
|
|||
|
|
import httpx
|
|||
|
|
from config import SILICONFLOW_API_KEY, SILICONFLOW_BASE_URL
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def download_file(url: str, local_path: str) -> str:
    """Download a file from a (COS pre-signed) URL and store it locally.

    Args:
        url: HTTP(S) URL to fetch. Redirects are followed.
        local_path: Destination path on the local filesystem. A missing
            parent directory is created first.

    Returns:
        ``local_path``, unchanged, after the file has been written.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    async with httpx.AsyncClient(timeout=120, follow_redirects=True) as client:
        resp = await client.get(url)
        resp.raise_for_status()
        # os.path.dirname() returns "" for a bare file name and
        # os.makedirs("") raises FileNotFoundError, so only create the
        # parent directory when the path actually has one.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(local_path, "wb") as f:
            f.write(resp.content)
    return local_path
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_txt(file_path: str) -> str:
    """Return the full contents of a plain-text file.

    The file is decoded as UTF-8; bytes that cannot be decoded are replaced
    with U+FFFD instead of raising.
    """
    with open(file_path, mode="r", encoding="utf-8", errors="replace") as handle:
        contents = handle.read()
    return contents
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_markdown(file_path: str) -> str:
    """Return the raw source of a Markdown file.

    The Markdown is neither rendered nor stripped; the file is read through
    ``parse_txt`` exactly like a plain-text file.
    """
    markdown_source = parse_txt(file_path)
    return markdown_source
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_docx(file_path: str) -> str:
    """Extract the paragraph text of a DOCX file.

    Only ``Document.paragraphs`` is read. Paragraphs that are empty or
    whitespace-only are skipped and the remaining ones are joined with a
    blank line between them.
    """
    from docx import Document

    document = Document(file_path)
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n\n".join(kept)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_pdf_text(file_path: str) -> str:
    """Extract the text layer of a PDF with PyMuPDF.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of every page that has non-whitespace text, joined with a
        blank line between pages. Pages without text are skipped, so the
        result is ``""`` when no page has a text layer.
    """
    import fitz

    doc = fitz.open(file_path)
    try:
        page_texts = [page.get_text() for page in doc]
    finally:
        # Release the document even when text extraction fails on a page.
        doc.close()
    return "\n\n".join(text for text in page_texts if text.strip())
|
|||
|
|
|
|||
|
|
|
|||
|
|
def pdf_needs_ocr(file_path: str) -> bool:
    """Decide whether a PDF needs OCR (its text layer is empty or nearly so).

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        ``True`` when the stripped text layer averages fewer than 50
        characters per page, which is treated as a scanned document.
    """
    import fitz

    doc = fitz.open(file_path)
    try:
        # The page count must be read while the document is still open;
        # querying a closed document fails. max(..., 1) avoids a division
        # by zero for a document without pages.
        page_count = max(len(doc), 1)
        total_len = sum(len(page.get_text().strip()) for page in doc)
    finally:
        doc.close()
    # Fewer than 50 characters per page on average -> scanned document.
    return (total_len / page_count) < 50
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def ocr_with_siliconflow(image_bytes: bytes) -> str:
    """Run OCR / image-to-text through a SiliconFlow multimodal chat model.

    The image is sent base64-encoded as a data URL together with a fixed
    (Chinese) prompt asking for all text in the picture, with tables rendered
    as Markdown tables and no extra explanation.

    Args:
        image_bytes: Raw bytes of the encoded image.

    Returns:
        The ``content`` of the first choice's message in the API response.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    b64 = base64.b64encode(image_bytes).decode()
    async with httpx.AsyncClient(timeout=60) as client:
        resp = await client.post(
            f"{SILICONFLOW_BASE_URL}/chat/completions",
            headers={"Authorization": f"Bearer {SILICONFLOW_API_KEY}"},
            json={
                "model": "Qwen/Qwen3-VL-32B-Instruct",
                "messages": [{
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "请识别并提取这张图片中的所有文字内容。如果有表格,请用 Markdown 表格格式输出。不要添加任何解释。"},
                        # NOTE(review): the data URL always declares image/png,
                        # whatever format the bytes are in -- confirm the API
                        # ignores the declared media type.
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
                    ],
                }],
                "max_tokens": 4096,
            },
        )
        # Fail with the real HTTP error (auth, rate limit, server error)
        # instead of an opaque KeyError when the body has no "choices".
        resp.raise_for_status()
        data = resp.json()
    return data["choices"][0]["message"]["content"]
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def parse_image_with_ocr(file_path: str) -> str:
    """Read an image file from disk and return the text recognized by OCR."""
    with open(file_path, mode="rb") as image_file:
        raw_image = image_file.read()
    recognized = await ocr_with_siliconflow(raw_image)
    return recognized
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_csv(file_path: str) -> str:
    """Load a CSV file with pandas and render it as a Markdown table.

    The DataFrame index is left out of the rendered table.
    """
    import pandas as pd

    table = pd.read_csv(file_path)
    rendered = table.to_markdown(index=False)
    return rendered
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_xlsx(file_path: str) -> str:
    """Load an XLSX workbook with pandas and render it as Markdown tables.

    Every worksheet is read, not only the first one. When at most one sheet
    has data rows, the result is a single table without a heading. When
    several sheets have data rows, each is rendered under a
    ``## <sheet name>`` heading, in workbook order, separated by blank lines.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The Markdown rendering described above; the DataFrame index is left
        out of every table.
    """
    import pandas as pd

    # sheet_name=None loads all worksheets as {sheet name: DataFrame}.
    sheets = pd.read_excel(file_path, sheet_name=None)
    populated = {name: frame for name, frame in sheets.items() if not frame.empty}

    if len(populated) > 1:
        return "\n\n".join(
            f"## {name}\n\n{frame.to_markdown(index=False)}"
            for name, frame in populated.items()
        )

    # Zero or one sheet with data: render a single, heading-less table.
    # Fall back to the first worksheet when no sheet has any data rows.
    frame = next(iter(populated.values()), None)
    if frame is None:
        frame = next(iter(sheets.values()))
    return frame.to_markdown(index=False)
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def _ocr_pdf_pages(file_path: str) -> str:
    """Render each PDF page to a 150-dpi PNG, OCR it, and join the page texts."""
    import fitz

    doc = fitz.open(file_path)
    try:
        results = []
        for page in doc:
            pix = page.get_pixmap(dpi=150)
            results.append(await ocr_with_siliconflow(pix.tobytes("png")))
    finally:
        # Release the document even when rendering or an OCR request fails.
        doc.close()
    return "\n\n".join(results)


async def parse_document(file_path: str, mime_type: str) -> str:
    """Route a file to the matching parser based on its extension.

    Args:
        file_path: Path of the file to parse. The parser is chosen from its
            lower-cased extension.
        mime_type: Accepted for the caller's convenience; routing currently
            relies on the file extension only and does not consult it.

    Returns:
        The extracted text (Markdown tables for CSV/XLSX, OCR output for
        images and scanned PDFs).

    Raises:
        ValueError: If the file extension is not supported.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".txt":
        return parse_txt(file_path)
    if ext in (".md", ".markdown"):
        return parse_markdown(file_path)
    if ext == ".docx":
        return parse_docx(file_path)
    if ext == ".csv":
        return parse_csv(file_path)
    if ext == ".xlsx":
        return parse_xlsx(file_path)
    if ext == ".pdf":
        needs_ocr = pdf_needs_ocr(file_path)
        text = parse_pdf_text(file_path)
        # Likely a scan: keep the text layer if it still has a usable amount
        # of text, otherwise OCR the document page by page.
        if needs_ocr and len(text.strip()) < 100:
            return await _ocr_pdf_pages(file_path)
        return text
    if ext in (".png", ".jpg", ".jpeg", ".webp", ".heic", ".bmp"):
        return await parse_image_with_ocr(file_path)
    raise ValueError(f"不支持的文件类型: {ext}")
|