From 5b0e38060a005e629ce191984f8daeb8afd387bf Mon Sep 17 00:00:00 2001
From: liwenju0 <like4hub@gmail.com>
Date: Fri, 7 Mar 2025 17:02:35 +0800
Subject: [PATCH] =?UTF-8?q?Feat=EF=BC=9AOptimize=20the=20table=20extractio?=
 =?UTF-8?q?n=20logic=20in=20the=20Markdown=20parser:=20(#5663)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enhance the recognition of both borderless and bordered Markdown tables.
Add support for extracting HTML tables, including various scenarios with
nested HTML tags. Improve performance by using conditional checks to
reduce unnecessary regular expression matching.

### What problem does this PR solve?

Optimize the table extraction logic in the Markdown parser:

- Enhance the recognition of both borderless and bordered Markdown tables.
- Add support for extracting HTML tables, including various scenarios with
  nested HTML tags.
- Improve performance by using conditional checks to reduce unnecessary
  regular expression matching.

### Type of change

- [x] Performance Improvement

Co-authored-by: wenju.li <wenju.li@deepctr.cn>
---
 deepdoc/parser/markdown_parser.py | 67 ++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 19 deletions(-)
diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index d5d138377..193e5b960 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -22,27 +22,56 @@ class RAGFlowMarkdownParser:
         self.chunk_token_num = int(chunk_token_num)
 
     def extract_tables_and_remainder(self, markdown_text):
-        # Standard Markdown table
-        table_pattern = re.compile(
-            r'''
-            (?:\n|^)                     
-            (?:\|.*?\|.*?\|.*?\n)        
-            (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) 
-            (?:\|.*?\|.*?\|.*?\n)+
+        tables = []
+        remainder = markdown_text
+        if "|" in markdown_text: # cheap pre-check: both pipe-table regexes below require a "|", so skip them when there is none
+            # Bordered Markdown table: a header row, a delimiter row, then one or more body rows, each starting with "|"
+            border_table_pattern = re.compile(
+                r'''
+                (?:\n|^)                     
+                (?:\|.*?\|.*?\|.*?\n)        
+                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) 
+                (?:\|.*?\|.*?\|.*?\n)+
             ''', re.VERBOSE)
-        tables = table_pattern.findall(markdown_text)
-        remainder = table_pattern.sub('', markdown_text)
+            border_tables = border_table_pattern.findall(markdown_text)
+            tables.extend(border_tables)
+            remainder = border_table_pattern.sub('', remainder)
 
-        # Borderless Markdown table
-        no_border_table_pattern = re.compile(
+            # Borderless Markdown table: rows start with a non-whitespace character and contain a "|"; searched in the text left after bordered tables were removed
+            no_border_table_pattern = re.compile(
+                r'''
+                (?:\n|^)                 
+                (?:\S.*?\|.*?\n)
+                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
+                (?:\S.*?\|.*?\n)+
+                ''', re.VERBOSE)
+            no_border_tables = no_border_table_pattern.findall(remainder)
+            tables.extend(no_border_tables)
+            remainder = no_border_table_pattern.sub('', remainder)
+
+        if "<table>" in remainder.lower(): # cheap pre-check before the HTML regex. NOTE(review): needs a literal "<table>", so tables with attributes (which <table[^>]*> below accepts) are skipped - confirm intended
+            # HTML table extraction: a bare <table>, or one wrapped in <body> or in <html><body>
+            html_table_pattern = re.compile(
             r'''
-            (?:\n|^)                 
-            (?:\S.*?\|.*?\n)
-            (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
-            (?:\S.*?\|.*?\n)+
-            ''', re.VERBOSE)
-        no_border_tables = no_border_table_pattern.findall(remainder)
-        tables.extend(no_border_tables)
-        remainder = no_border_table_pattern.sub('', remainder)
+            (?:\n|^)
+            \s*
+            (?:
+                # case1: <html><body><table>...</table></body></html>
+                (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
+                |
+                # case2: <body><table>...</table></body>
+                (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
+                |
+                # case3: only<table>...</table>
+                (?:<table[^>]*>.*?</table>)
+            )
+            \s*
+            (?=\n|$)
+            ''',
+            re.VERBOSE | re.DOTALL | re.IGNORECASE
+            )
+            html_tables = html_table_pattern.findall(remainder)
+            tables.extend(html_tables)
+            remainder = html_table_pattern.sub('', remainder)
 
         return remainder, tables