Feat: Optimize the table extraction logic in the Markdown parser (#5663)

Enhance the recognition of both borderless and bordered Markdown tables.
Add support for extracting HTML tables, including various scenarios with
nested HTML tags. Improve performance by using conditional checks to
reduce unnecessary regular expression matching.

### What problem does this PR solve?

Optimize the table extraction logic in the Markdown parser:

- Enhance the recognition of both borderless and bordered Markdown tables.
- Add support for extracting HTML tables, including various scenarios with
  nested HTML tags.
- Improve performance by using conditional checks to reduce unnecessary
  regular expression matching.

### Type of change

- [x] Performance Improvement

Co-authored-by: wenju.li <wenju.li@deepctr.cn>
This commit is contained in:
liwenju0 2025-03-07 17:02:35 +08:00 committed by GitHub
parent 66938e0b68
commit 5b0e38060a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -22,27 +22,56 @@ class RAGFlowMarkdownParser:
self.chunk_token_num = int(chunk_token_num) self.chunk_token_num = int(chunk_token_num)
def extract_tables_and_remainder(self, markdown_text): def extract_tables_and_remainder(self, markdown_text):
# Standard Markdown table tables = []
table_pattern = re.compile( remainder = markdown_text
r''' if "|" in markdown_text: # for optimize performance
(?:\n|^) # Standard Markdown table
(?:\|.*?\|.*?\|.*?\n) border_table_pattern = re.compile(
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) r'''
(?:\|.*?\|.*?\|.*?\n)+ (?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE) ''', re.VERBOSE)
tables = table_pattern.findall(markdown_text) border_tables = border_table_pattern.findall(markdown_text)
remainder = table_pattern.sub('', markdown_text) tables.extend(border_tables)
remainder = border_table_pattern.sub('', remainder)
# Borderless Markdown table # Borderless Markdown table
no_border_table_pattern = re.compile( no_border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
''', re.VERBOSE)
no_border_tables = no_border_table_pattern.findall(remainder)
tables.extend(no_border_tables)
remainder = no_border_table_pattern.sub('', remainder)
if "<table>" in remainder.lower(): # for optimize performance
#HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile(
r''' r'''
(?:\n|^) (?:\n|^)
(?:\S.*?\|.*?\n) \s*
(?:(?:\s*[:-]+[-| :]*\s*).*?\n) (?:
(?:\S.*?\|.*?\n)+ # case1: <html><body><table>...</table></body></html>
''', re.VERBOSE) (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
no_border_tables = no_border_table_pattern.findall(remainder) |
tables.extend(no_border_tables) # case2: <body><table>...</table></body>
remainder = no_border_table_pattern.sub('', remainder) (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
# case3: only<table>...</table>
(?:<table[^>]*>.*?</table>)
)
\s*
(?=\n|$)
''',
re.VERBOSE | re.DOTALL | re.IGNORECASE
)
html_tables = html_table_pattern.findall(remainder)
tables.extend(html_tables)
remainder = html_table_pattern.sub('', remainder)
return remainder, tables return remainder, tables