diff --git a/rag/app/naive.py b/rag/app/naive.py index 28e3bbbcc..551b42b6f 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -32,7 +32,6 @@ from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, Mark from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table -from rag.utils import num_tokens_from_string class Docx(DocxParser): @@ -335,17 +334,13 @@ class Markdown(MarkdownParser): sections = [] tbls = [] for sec in remainder.split("\n"): - if num_tokens_from_string(sec) > 3 * self.chunk_token_num: - sections.append((sec[:int(len(sec) / 2)], "")) - sections.append((sec[int(len(sec) / 2):], "")) + if sec.strip().find("#") == 0: + sections.append((sec, "")) + elif sections and sections[-1][0].strip().find("#") == 0: + sec_, _ = sections.pop(-1) + sections.append((sec_ + "\n" + sec, "")) else: - if sec.strip().find("#") == 0: - sections.append((sec, "")) - elif sections and sections[-1][0].strip().find("#") == 0: - sec_, _ = sections.pop(-1) - sections.append((sec_ + "\n" + sec, "")) - else: - sections.append((sec, "")) + sections.append((sec, "")) for table in tables: tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) return sections, tbls diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 34881d924..06f49623e 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -545,7 +545,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"): add_chunk(sub_sec, pos) return cks - + def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"): if not texts or len(texts) != len(images): @@ -676,6 +676,8 @@ def get_delimiters(delimiters: str): s = t if s < len(delimiters): dels.extend(list(delimiters[s:])) + + dels.sort(key=lambda x: -len(x)) dels = [re.escape(d) for d in dels if d] dels = [d for d in dels if d] dels_pattern = "|".join(dels)