mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-10 16:59:01 +08:00
Fix: add advanced delimiter detection for naive merge (#7941)
### What problem does this PR solve?

Add advanced delimiter detection for naive merge. #7824

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
6ba5a4348a
commit
46963ab1ca
@ -536,8 +536,13 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
cks[-1] += t
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
dels = get_delimiters(delimiter)
|
||||
for sec, pos in sections:
|
||||
add_chunk(sec, pos)
|
||||
splited_sec = re.split(r"(%s)" % dels, sec)
|
||||
for sub_sec in splited_sec:
|
||||
if re.match(f"^{dels}$", sub_sec):
|
||||
continue
|
||||
add_chunk(sub_sec, pos)
|
||||
|
||||
return cks
|
||||
|
||||
@ -576,8 +581,13 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
||||
result_images[-1] = concat_img(result_images[-1], image)
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
dels = get_delimiters(delimiter)
|
||||
for text, image in zip(texts, images):
|
||||
add_chunk(text, image)
|
||||
splited_sec = re.split(r"(%s)" % dels, text)
|
||||
for sub_sec in splited_sec:
|
||||
if re.match(f"^{dels}$", sub_sec):
|
||||
continue
|
||||
add_chunk(text, image)
|
||||
|
||||
return cks, result_images
|
||||
|
||||
@ -640,8 +650,13 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
images[-1] = concat_img(images[-1], image)
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
dels = get_delimiters(delimiter)
|
||||
for sec, image in sections:
|
||||
add_chunk(sec, image, '')
|
||||
splited_sec = re.split(r"(%s)" % dels, sec)
|
||||
for sub_sec in splited_sec:
|
||||
if re.match(f"^{dels}$", sub_sec):
|
||||
continue
|
||||
add_chunk(sub_sec, image,"")
|
||||
|
||||
return cks, images
|
||||
|
||||
@ -649,3 +664,20 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
def extract_between(text: str, start_tag: str, end_tag: str) -> list[str]:
    """Return every substring of *text* enclosed by *start_tag* and *end_tag*.

    Both tags are matched literally (they are regex-escaped), the enclosed
    content is matched non-greedily, and it may span multiple lines because
    the pattern is compiled with ``re.DOTALL``. The tags themselves are not
    included in the returned strings.
    """
    opening = re.escape(start_tag)
    closing = re.escape(end_tag)
    matcher = re.compile(opening + r"(.*?)" + closing, flags=re.DOTALL)
    return matcher.findall(text)
|
||||
|
||||
|
||||
def get_delimiters(delimiters: str):
    """Build a regex alternation matching any delimiter described by *delimiters*.

    The specification string is read as follows:

    * text wrapped in backticks (```` `##` ````) is one multi-character
      delimiter, taken literally;
    * every character outside a backtick pair is its own single-character
      delimiter.

    Each delimiter is regex-escaped and the results are joined with ``|``.
    Duplicates are dropped, and longer delimiters are placed before shorter
    ones: regex alternation tries branches left to right, so without this
    ordering a short delimiter that is a prefix of a longer one would always
    win and the longer one could never match as a unit.

    Args:
        delimiters: The delimiter specification string.

    Returns:
        The alternation pattern as a string (no enclosing group). An empty
        specification yields an empty string.
    """
    found = []
    cursor = 0
    for m in re.finditer(r"`([^`]+)`", delimiters):
        start, end = m.span()
        found.append(m.group(1))
        # Characters between the previous backtick group and this one are
        # individual single-character delimiters.
        found.extend(delimiters[cursor:start])
        cursor = end
    # Trailing characters after the last backtick group (or the whole string
    # when there is no group at all).
    found.extend(delimiters[cursor:])

    # Drop duplicates while keeping first-seen order, then put the longest
    # delimiters first; the sort is stable, so equal-length delimiters keep
    # their relative order.
    unique = list(dict.fromkeys(found))
    unique.sort(key=len, reverse=True)

    return "|".join(re.escape(d) for d in unique)
|
||||
|
Loading…
x
Reference in New Issue
Block a user