def extract_tables_and_remainder(self, markdown_text):
    """Split Markdown text into its tables and the text that is left over.

    Three passes are made, each one working on what the previous pass
    left behind:

    1. pipe-bordered Markdown tables (every row starts with ``|``),
    2. borderless Markdown tables (rows contain ``|`` but do not start
       with it),
    3. HTML tables, either bare or wrapped in ``<body>`` /
       ``<html><body>``.

    Args:
        markdown_text: The Markdown document as a single string.

    Returns:
        A ``(remainder, tables)`` tuple. ``tables`` lists the raw matched
        table strings (bordered first, then borderless, then HTML), each
        including the newline that preceded it when there was one.
        ``remainder`` is ``markdown_text`` with every matched span removed.

    NOTE: every Markdown table row must end with a newline to match, so a
    Markdown table that ends the text without a trailing newline is left
    in ``remainder``.
    """
    tables = []
    remainder = markdown_text

    # Cheap substring check: neither Markdown pattern can match without
    # a pipe, so skip both regex scans for pipe-free text.
    if "|" in markdown_text:
        # Standard (bordered) Markdown table: header row, separator row,
        # then one or more body rows. No capturing groups are used, so
        # findall() yields the whole matched table text.
        border_table_pattern = re.compile(
            r'''
            (?:\n|^)
            (?:\|.*?\|.*?\|.*?\n)
            (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
            (?:\|.*?\|.*?\|.*?\n)+
            ''', re.VERBOSE)
        tables.extend(border_table_pattern.findall(remainder))
        remainder = border_table_pattern.sub('', remainder)

        # Borderless Markdown table: same three-part shape, but rows
        # start with a non-space character instead of a pipe. Runs on the
        # remainder so bordered tables are not matched a second time.
        no_border_table_pattern = re.compile(
            r'''
            (?:\n|^)
            (?:\S.*?\|.*?\n)
            (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
            (?:\S.*?\|.*?\n)+
            ''', re.VERBOSE)
        tables.extend(no_border_table_pattern.findall(remainder))
        remainder = no_border_table_pattern.sub('', remainder)

    # Check for the opening-tag prefix (without the closing '>') so that
    # tables carrying attributes, e.g. <table border="1">, pass the guard.
    if "<table" in remainder.lower():
        # HTML table extraction - handle possible html/body wrapper tags.
        # The most specific wrapper is listed first so it is consumed
        # together with the table it encloses.
        html_table_pattern = re.compile(
            r'''
            (?:\n|^)
            \s*
            (?:
                # case1: <html><body><table>...</table></body></html>
                (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
                |
                # case2: <body><table>...</table></body>
                (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
                |
                # case3: only <table>...</table>
                (?:<table[^>]*>.*?</table>)
            )
            \s*
            (?=\n|$)
            ''',
            re.VERBOSE | re.DOTALL | re.IGNORECASE
        )
        tables.extend(html_table_pattern.findall(remainder))
        remainder = html_table_pattern.sub('', remainder)

    return remainder, tables