diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py index d5d138377..193e5b960 100644 --- a/deepdoc/parser/markdown_parser.py +++ b/deepdoc/parser/markdown_parser.py @@ -22,27 +22,56 @@ class RAGFlowMarkdownParser: self.chunk_token_num = int(chunk_token_num) def extract_tables_and_remainder(self, markdown_text): - # Standard Markdown table - table_pattern = re.compile( - r''' - (?:\n|^) - (?:\|.*?\|.*?\|.*?\n) - (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) - (?:\|.*?\|.*?\|.*?\n)+ + tables = [] + remainder = markdown_text + if "|" in markdown_text: # for optimize performance + # Standard Markdown table + border_table_pattern = re.compile( + r''' + (?:\n|^) + (?:\|.*?\|.*?\|.*?\n) + (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) + (?:\|.*?\|.*?\|.*?\n)+ ''', re.VERBOSE) - tables = table_pattern.findall(markdown_text) - remainder = table_pattern.sub('', markdown_text) + border_tables = border_table_pattern.findall(markdown_text) + tables.extend(border_tables) + remainder = border_table_pattern.sub('', remainder) - # Borderless Markdown table - no_border_table_pattern = re.compile( + # Borderless Markdown table + no_border_table_pattern = re.compile( + r''' + (?:\n|^) + (?:\S.*?\|.*?\n) + (?:(?:\s*[:-]+[-| :]*\s*).*?\n) + (?:\S.*?\|.*?\n)+ + ''', re.VERBOSE) + no_border_tables = no_border_table_pattern.findall(remainder) + tables.extend(no_border_tables) + remainder = no_border_table_pattern.sub('', remainder) + + if "