Feat: Optimize the table extraction logic in the Markdown parser (#5663)

- Enhance the recognition of both borderless and bordered Markdown tables.
- Add support for extracting HTML tables, including various scenarios with nested HTML tags.
- Improve performance by using conditional checks to reduce unnecessary regular expression matching.

### What problem does this PR solve?

Optimize the table extraction logic in the Markdown parser:

- Enhance the recognition of both borderless and bordered Markdown tables.
- Add support for extracting HTML tables, including various scenarios with nested HTML tags.
- Improve performance by using conditional checks to reduce unnecessary regular expression matching.

### Type of change

- [x] Performance Improvement

Co-authored-by: wenju.li <wenju.li@deepctr.cn>
2025-08-12 19:18:59 +08:00 · 2025-03-07 17:02:35 +08:00 · 2025-03-07 17:02:35 +08:00 · 5b0e38060a
commit 5b0e38060a
parent 66938e0b68
1 changed files with 48 additions and 19 deletions
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@ -22,27 +22,56 @@ class RAGFlowMarkdownParser:
        self.chunk_token_num = int(chunk_token_num)
    def extract_tables_and_remainder(self, markdown_text):
-        # Standard Markdown table
+        tables = []
-        table_pattern = re.compile(
+        remainder = markdown_text
-            r'''
+        if "|" in markdown_text: # for optimize performance
-            (?:\n|^)                     
+            # Standard Markdown table
-            (?:\|.*?\|.*?\|.*?\n)        
+            border_table_pattern = re.compile(
-            (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) 
+                r'''
-            (?:\|.*?\|.*?\|.*?\n)+
+                (?:\n|^)                     
                (?:\|.*?\|.*?\|.*?\n)        
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) 
                (?:\|.*?\|.*?\|.*?\n)+
            ''', re.VERBOSE)
-        tables = table_pattern.findall(markdown_text)
+            border_tables = border_table_pattern.findall(markdown_text)
-        remainder = table_pattern.sub('', markdown_text)
+            tables.extend(border_tables)
            remainder = border_table_pattern.sub('', remainder)
-        # Borderless Markdown table
+            # Borderless Markdown table
-        no_border_table_pattern = re.compile(
+            no_border_table_pattern = re.compile(
                r'''
                (?:\n|^)                 
                (?:\S.*?\|.*?\n)
                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                (?:\S.*?\|.*?\n)+
                ''', re.VERBOSE)
            no_border_tables = no_border_table_pattern.findall(remainder)
            tables.extend(no_border_tables)
            remainder = no_border_table_pattern.sub('', remainder)
        if "<table>" in remainder.lower(): # for optimize performance
            #HTML table extraction - handle possible html/body wrapper tags
            html_table_pattern = re.compile(
            r'''
-            (?:\n|^)                 
+            (?:\n|^)
-            (?:\S.*?\|.*?\n)
+            \s*
-            (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
+            (?:
-            (?:\S.*?\|.*?\n)+
+                # case1: <html><body><table>...</table></body></html>
-            ''', re.VERBOSE)
+                (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
-        no_border_tables = no_border_table_pattern.findall(remainder)
+                |
-        tables.extend(no_border_tables)
+                # case2: <body><table>...</table></body>
-        remainder = no_border_table_pattern.sub('', remainder)
+                (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
                |
                # case3: only<table>...</table>
                (?:<table[^>]*>.*?</table>)
            )
            \s*
            (?=\n|$)
            ''',
            re.VERBOSE | re.DOTALL | re.IGNORECASE
            )
            html_tables = html_table_pattern.findall(remainder)
            tables.extend(html_tables)
            remainder = html_table_pattern.sub('', remainder)
        return remainder, tables