mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-06-04 11:24:00 +08:00
Fix: Unnecessary truncation in markdown parser (#7972)
### What problem does this PR solve?

Fix unnecessary truncation in the markdown parser, so that markdown is parsed correctly, as in [this](https://github.com/infiniflow/ragflow/issues/7824#issuecomment-2921312576) example from #7824, including support for multiple special delimiters.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
31f4d44c73
commit
bd4678bca6
@ -32,7 +32,6 @@ from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, Mark
|
|||||||
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
|
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
|
||||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
||||||
from rag.utils import num_tokens_from_string
|
|
||||||
|
|
||||||
|
|
||||||
class Docx(DocxParser):
|
class Docx(DocxParser):
|
||||||
@ -335,17 +334,13 @@ class Markdown(MarkdownParser):
|
|||||||
sections = []
|
sections = []
|
||||||
tbls = []
|
tbls = []
|
||||||
for sec in remainder.split("\n"):
|
for sec in remainder.split("\n"):
|
||||||
if num_tokens_from_string(sec) > 3 * self.chunk_token_num:
|
if sec.strip().find("#") == 0:
|
||||||
sections.append((sec[:int(len(sec) / 2)], ""))
|
sections.append((sec, ""))
|
||||||
sections.append((sec[int(len(sec) / 2):], ""))
|
elif sections and sections[-1][0].strip().find("#") == 0:
|
||||||
|
sec_, _ = sections.pop(-1)
|
||||||
|
sections.append((sec_ + "\n" + sec, ""))
|
||||||
else:
|
else:
|
||||||
if sec.strip().find("#") == 0:
|
sections.append((sec, ""))
|
||||||
sections.append((sec, ""))
|
|
||||||
elif sections and sections[-1][0].strip().find("#") == 0:
|
|
||||||
sec_, _ = sections.pop(-1)
|
|
||||||
sections.append((sec_ + "\n" + sec, ""))
|
|
||||||
else:
|
|
||||||
sections.append((sec, ""))
|
|
||||||
for table in tables:
|
for table in tables:
|
||||||
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
|
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
|
||||||
return sections, tbls
|
return sections, tbls
|
||||||
|
@ -545,7 +545,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
|||||||
add_chunk(sub_sec, pos)
|
add_chunk(sub_sec, pos)
|
||||||
|
|
||||||
return cks
|
return cks
|
||||||
|
|
||||||
|
|
||||||
def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
|
def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
|
||||||
if not texts or len(texts) != len(images):
|
if not texts or len(texts) != len(images):
|
||||||
@ -676,6 +676,8 @@ def get_delimiters(delimiters: str):
|
|||||||
s = t
|
s = t
|
||||||
if s < len(delimiters):
|
if s < len(delimiters):
|
||||||
dels.extend(list(delimiters[s:]))
|
dels.extend(list(delimiters[s:]))
|
||||||
|
|
||||||
|
dels.sort(key=lambda x: -len(x))
|
||||||
dels = [re.escape(d) for d in dels if d]
|
dels = [re.escape(d) for d in dels if d]
|
||||||
dels = [d for d in dels if d]
|
dels = [d for d in dels if d]
|
||||||
dels_pattern = "|".join(dels)
|
dels_pattern = "|".join(dels)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user