121 lines
3.7 KiB
Python
121 lines
3.7 KiB
Python
|
|
"""文本切片:递归字符分割 + 中文分句保护"""
|
|||
|
|
|
|||
|
|
import re
|
|||
|
|
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Sentence-boundary delimiters (Chinese and ASCII punctuation), captured in one group so
# that re.split() keeps them in its output.
# The two-character combinations come first: regex alternation is tried left to right,
# so listing them after the character class would let a lone "!" or "?" match first and
# break "!?" / "?!" into two separate delimiters.
_CN_SENT_PATTERN = re.compile(
    r"([!\uff01][?\uff1f]|[?\uff1f][!\uff01]"  # "!?" / "?!", ASCII or full-width
    r"|[。!?;\uff01\uff1f\uff1b\n]"  # single terminators; \uff01 \uff1f \uff1b are full-width ! ? ;
    r"|(?<!\d)\.(?!\d))"  # ASCII period with no digit directly before or after it
)
# Markdown heading prefix: one to six "#" characters followed by whitespace at line start.
_MD_HEADING = re.compile(r"^#{1,6}\s+", re.MULTILINE)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _split_sentences(text: str) -> list[str]:
    """Split text into sentences, keeping each delimiter at the end of its sentence.

    The text is split on ``_CN_SENT_PATTERN``; every delimiter closes the sentence
    accumulated so far, so a delimiter that directly follows another one becomes a
    sentence of its own. Whitespace inside sentences is preserved as-is.

    Args:
        text: The text to split.

    Returns:
        The sentences in order. Trailing text without a closing delimiter is
        included as a final sentence unless it is empty or whitespace-only.
    """
    result: list[str] = []
    pending: list[str] = []

    # re.split() yields empty strings between adjacent delimiters; filter drops them.
    for piece in filter(None, _CN_SENT_PATTERN.split(text)):
        pending.append(piece)
        if _CN_SENT_PATTERN.match(piece) is not None:
            # The piece is a delimiter: close the current sentence.
            result.append("".join(pending))
            pending.clear()

    tail = "".join(pending)
    if tail.strip():
        result.append(tail)
    return result
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _split_by_heading(md_text: str) -> list[dict]:
    """Split Markdown text into sections at heading lines.

    A line matching ``_MD_HEADING`` starts a new section, and the stripped heading
    line (including its leading ``#`` characters) becomes that section's
    ``sectionTitle``. Lines inside fenced code blocks (opened by three backticks
    or ``~~~``) are never treated as headings, so ``# comment`` lines in code
    samples stay with the section that contains them.

    Args:
        md_text: Markdown source text.

    Returns:
        A list of ``{"sectionTitle": str, "text": str}`` dicts in document order.
        Text before the first heading gets an empty title. Sections whose body is
        empty or whitespace-only are omitted. If no section has any body text, a
        single entry holding the whole input with an empty title is returned.
    """
    sections: list[dict] = []
    title = ""
    body_lines: list[str] = []
    fence_char = ""  # "`" or "~" while inside a fenced code block, "" otherwise

    for line in md_text.split("\n"):
        marker = line.lstrip()[:3]
        if fence_char:
            # Inside a fence: only a fence line using the same character closes it.
            if marker == fence_char * 3:
                fence_char = ""
            body_lines.append(line)
        elif marker in ("```", "~~~"):
            fence_char = marker[0]
            body_lines.append(line)
        elif _MD_HEADING.match(line):
            # Flush the section collected so far before starting the next one.
            body = "\n".join(body_lines).strip()
            if body:
                sections.append({"sectionTitle": title, "text": body})
            title = line.strip()
            body_lines = []
        else:
            body_lines.append(line)

    body = "\n".join(body_lines).strip()
    if body:
        sections.append({"sectionTitle": title, "text": body})

    return sections if sections else [{"sectionTitle": "", "text": md_text}]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _estimate_tokens(text: str) -> int:
    """Return a rough token estimate for mixed Chinese/English text.

    Each run of ASCII letters counts as one token, and the characters in the CJK
    range used by the pattern below are counted at 1.5 characters per token
    (rounded down). Digits, punctuation and whitespace contribute nothing.

    Args:
        text: The text to measure.

    Returns:
        The estimated number of tokens.
    """
    han_count = sum(1 for _ in re.finditer(r"[一-鿿]", text))
    word_count = sum(1 for _ in re.finditer(r"[a-zA-Z]+", text))
    return word_count + int(han_count / 1.5)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _chunk_text(text: str, section_title: str = "", page_number: int | None = None) -> list[dict]:
    """Pack sentences into chunks of roughly ``CHUNK_SIZE`` tokens with overlap.

    Sentences from ``_split_sentences`` are appended to a buffer until adding the
    next one would push the estimated token count past ``CHUNK_SIZE``; the buffer
    is then emitted as a chunk and the next buffer starts with the tail of the
    previous one. A single sentence that exceeds ``CHUNK_SIZE`` on its own is not
    split further and ends up in an oversized chunk.

    Args:
        text: The text to chunk.
        section_title: Stored unchanged as ``sectionTitle`` on every chunk.
        page_number: Stored unchanged as ``pageNumber`` on every chunk.

    Returns:
        A list of ``{"content", "sectionTitle", "pageNumber"}`` dicts, where
        ``content`` is the stripped chunk text. Empty if ``text`` has no
        non-whitespace content.
    """
    # Overlap is taken in characters, using ~2 characters per token as a rough guess.
    # It is computed once and checked explicitly: a slice of buf[-0:] would return the
    # whole buffer, carrying the entire previous chunk over instead of nothing.
    overlap_chars = int(CHUNK_OVERLAP * 2) if CHUNK_OVERLAP > 0 else 0

    def make_chunk(content: str) -> dict:
        return {"content": content.strip(), "sectionTitle": section_title, "pageNumber": page_number}

    chunks = []
    buf = ""
    buf_tokens = 0

    for sentence in _split_sentences(text):
        s_tokens = _estimate_tokens(sentence)
        # Flush only when the buffer already has counted tokens, so an oversized
        # first sentence never produces an empty chunk.
        if buf_tokens > 0 and buf_tokens + s_tokens > CHUNK_SIZE:
            chunks.append(make_chunk(buf))
            if overlap_chars > 0:
                overlap_text = buf[-overlap_chars:]
                buf = overlap_text + sentence
                buf_tokens = _estimate_tokens(overlap_text) + s_tokens
            else:
                buf = sentence
                buf_tokens = s_tokens
        else:
            buf += sentence
            buf_tokens += s_tokens

    if buf.strip():
        chunks.append(make_chunk(buf))

    return chunks
|
|||
|
|
|
|||
|
|
|
|||
|
|
def chunk_document(text: str, source_type: str = "text") -> list[dict]:
    """Split a document into chunks and label each one.

    When ``source_type`` is ``"md"`` or ``"markdown"`` the text is first divided
    into sections by ``_split_by_heading``; any other value treats the whole text
    as one untitled section. Each section is then chunked by ``_chunk_text``.

    Args:
        text: The document text.
        source_type: Source format; only ``"md"`` and ``"markdown"`` enable
            heading-based sectioning.

    Returns:
        A list of chunk dicts with keys ``content``, ``sectionTitle``,
        ``pageNumber``, ``chunkIndex`` (position in the returned list) and
        ``chunkType`` (``"table"``, ``"code"`` or ``"text"``).
    """
    if source_type in ("md", "markdown"):
        sections = _split_by_heading(text)
    else:
        sections = [{"sectionTitle": "", "text": text}]

    all_chunks = [
        chunk
        for section in sections
        for chunk in _chunk_text(section["text"], section_title=section.get("sectionTitle", ""))
    ]

    for index, chunk in enumerate(all_chunks):
        chunk["chunkIndex"] = index
        body = chunk["content"]
        # Heuristic labels: a table needs a "---" separator and more than five pipes;
        # the table check runs first, so it wins over the code-fence check.
        if "---" in body and body.count("|") > 5:
            kind = "table"
        elif "```" in body:
            kind = "code"
        else:
            kind = "text"
        chunk["chunkType"] = kind

    return all_chunks
|