From 46963ab1cab525a253b8b12ece0d933419a54355 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Thu, 29 May 2025 16:17:22 +0800 Subject: [PATCH] Fix: add advanced delimiter detection for naive merge (#7941) ### What problem does this PR solve? Add advanced delimiter detection for naive merge. #7824 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) --- rag/nlp/__init__.py | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 0fe0653fc..34881d924 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -536,8 +536,13 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"): cks[-1] += t tk_nums[-1] += tnum + dels = get_delimiters(delimiter) for sec, pos in sections: - add_chunk(sec, pos) + splited_sec = re.split(r"(%s)" % dels, sec) + for sub_sec in splited_sec: + if re.match(f"^{dels}$", sub_sec): + continue + add_chunk(sub_sec, pos) return cks @@ -576,8 +581,13 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 result_images[-1] = concat_img(result_images[-1], image) tk_nums[-1] += tnum + dels = get_delimiters(delimiter) for text, image in zip(texts, images): - add_chunk(text, image) + splited_sec = re.split(r"(%s)" % dels, text) + for sub_sec in splited_sec: + if re.match(f"^{dels}$", sub_sec): + continue + add_chunk(text, image) return cks, result_images @@ -640,8 +650,13 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): images[-1] = concat_img(images[-1], image) tk_nums[-1] += tnum + dels = get_delimiters(delimiter) for sec, image in sections: - add_chunk(sec, image, '') + splited_sec = re.split(r"(%s)" % dels, sec) + for sub_sec in splited_sec: + if re.match(f"^{dels}$", sub_sec): + continue + add_chunk(sub_sec, image,"") return cks, images @@ -649,3 +664,20 @@ def naive_merge_docx(sections, 
# (tail of the preceding diff hunk header) chunk_token_num=128, delimiter="\n。;!?"):
def extract_between(text: str, start_tag: str, end_tag: str) -> list[str]:
    """Return every substring of *text* enclosed by *start_tag* and *end_tag*.

    Both tags are matched literally (they are regex-escaped). The match
    between them is non-greedy and may span newlines.
    """
    pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
    return re.findall(pattern, text, flags=re.DOTALL)


def get_delimiters(delimiters: str) -> str:
    """Build a regex alternation that matches any configured delimiter.

    A run of characters wrapped in backticks (e.g. ``\\n`##`;``) is taken as
    one multi-character delimiter; every character outside backticks is a
    single-character delimiter on its own.

    Args:
        delimiters: The delimiter specification string.

    Returns:
        The regex-escaped delimiters joined with ``|``, longest first, or an
        empty string when *delimiters* is empty.
    """
    dels = []
    s = 0
    for m in re.finditer(r"`([^`]+)`", delimiters):
        f, t = m.span()
        dels.append(m.group(1))
        # Characters between the previous backtick group and this one are
        # individual delimiters.
        dels.extend(delimiters[s:f])
        s = t
    # Whatever follows the last backtick group (or the whole string when
    # there is none) is made of single-character delimiters too.
    dels.extend(delimiters[s:])

    # Regex alternation is first-match-wins, so a shorter delimiter listed
    # before a longer one sharing its prefix ("ab" before "abc") would shadow
    # it. Drop duplicates, then order longest first; the sort is stable, so
    # equally long delimiters keep the order they were given in.
    ordered = sorted(dict.fromkeys(dels), key=len, reverse=True)
    return "|".join(re.escape(d) for d in ordered)