From 5b0e38060a005e629ce191984f8daeb8afd387bf Mon Sep 17 00:00:00 2001 From: liwenju0 Date: Fri, 7 Mar 2025 17:02:35 +0800 Subject: [PATCH] =?UTF-8?q?Feat=EF=BC=9AOptimize=20the=20table=20extractio?= =?UTF-8?q?n=20logic=20in=20the=20Markdown=20parser:=20(#5663)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance the recognition of both borderless and bordered Markdown tables. Add support for extracting HTML tables, including various scenarios with nested HTML tags. Improve performance by using conditional checks to reduce unnecessary regular expression matching. ### What problem does this PR solve? Optimize the table extraction logic in the Markdown parser: Enhance the recognition of both borderless and bordered Markdown tables. Add support for extracting HTML tables, including various scenarios with nested HTML tags. Improve performance by using conditional checks to reduce unnecessary regular expression matching. ### Type of change - [x] Performance Improvement Co-authored-by: wenju.li --- deepdoc/parser/markdown_parser.py | 67 ++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py index d5d138377..193e5b960 100644 --- a/deepdoc/parser/markdown_parser.py +++ b/deepdoc/parser/markdown_parser.py @@ -22,27 +22,56 @@ class RAGFlowMarkdownParser: self.chunk_token_num = int(chunk_token_num) def extract_tables_and_remainder(self, markdown_text): - # Standard Markdown table - table_pattern = re.compile( - r''' - (?:\n|^) - (?:\|.*?\|.*?\|.*?\n) - (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) - (?:\|.*?\|.*?\|.*?\n)+ + tables = [] + remainder = markdown_text + if "|" in markdown_text: # for optimize performance + # Standard Markdown table + border_table_pattern = re.compile( + r''' + (?:\n|^) + (?:\|.*?\|.*?\|.*?\n) + (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) + (?:\|.*?\|.*?\|.*?\n)+ ''', re.VERBOSE) - tables = 
table_pattern.findall(markdown_text) - remainder = table_pattern.sub('', markdown_text) + border_tables = border_table_pattern.findall(markdown_text) + tables.extend(border_tables) + remainder = border_table_pattern.sub('', remainder) - # Borderless Markdown table - no_border_table_pattern = re.compile( + # Borderless Markdown table + no_border_table_pattern = re.compile( + r''' + (?:\n|^) + (?:\S.*?\|.*?\n) + (?:(?:\s*[:-]+[-| :]*\s*).*?\n) + (?:\S.*?\|.*?\n)+ + ''', re.VERBOSE) + no_border_tables = no_border_table_pattern.findall(remainder) + tables.extend(no_border_tables) + remainder = no_border_table_pattern.sub('', remainder) + + if "<table" in remainder.lower(): # for optimize performance + #HTML table extraction - handle possible html/body wrapper tags + html_table_pattern = re.compile( r''' - (?:\n|^) - (?:\S.*?\|.*?\n) - (?:(?:\s*[:-]+[-| :]*\s*).*?\n) - (?:\S.*?\|.*?\n)+ - ''', re.VERBOSE) - no_border_tables = no_border_table_pattern.findall(remainder) - tables.extend(no_border_tables) - remainder = no_border_table_pattern.sub('', remainder) + (?:\n|^) + \s* + (?: + # case1: <html><body><table>...</table></body></html> + (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>) + | + # case2: <body><table>...</table></body> + (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>) + | + # case3: only<table>...</table> + (?:<table[^>]*>.*?</table>) + ) + \s* + (?=\n|$) + ''', + re.VERBOSE | re.DOTALL | re.IGNORECASE + ) + html_tables = html_table_pattern.findall(remainder) + tables.extend(html_tables) + remainder = html_table_pattern.sub('', remainder) return remainder, tables