mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 19:18:59 +08:00
Feat: Optimize the table extraction logic in the Markdown parser (#5663)
Enhance the recognition of both borderless and bordered Markdown tables.
Add support for extracting HTML tables, including various scenarios with nested HTML tags.
Improve performance by using conditional checks to reduce unnecessary regular expression matching.

### What problem does this PR solve?

Optimize the table extraction logic in the Markdown parser:
Enhance the recognition of both borderless and bordered Markdown tables.
Add support for extracting HTML tables, including various scenarios with nested HTML tags.
Improve performance by using conditional checks to reduce unnecessary regular expression matching.

### Type of change

- [x] Performance Improvement

Co-authored-by: wenju.li <wenju.li@deepctr.cn>
This commit is contained in:
parent
66938e0b68
commit
5b0e38060a
@ -22,27 +22,56 @@ class RAGFlowMarkdownParser:
|
|||||||
self.chunk_token_num = int(chunk_token_num)
|
self.chunk_token_num = int(chunk_token_num)
|
||||||
|
|
||||||
def extract_tables_and_remainder(self, markdown_text):
|
def extract_tables_and_remainder(self, markdown_text):
|
||||||
# Standard Markdown table
|
tables = []
|
||||||
table_pattern = re.compile(
|
remainder = markdown_text
|
||||||
r'''
|
if "|" in markdown_text: # for optimize performance
|
||||||
(?:\n|^)
|
# Standard Markdown table
|
||||||
(?:\|.*?\|.*?\|.*?\n)
|
border_table_pattern = re.compile(
|
||||||
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
|
r'''
|
||||||
(?:\|.*?\|.*?\|.*?\n)+
|
(?:\n|^)
|
||||||
|
(?:\|.*?\|.*?\|.*?\n)
|
||||||
|
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
|
||||||
|
(?:\|.*?\|.*?\|.*?\n)+
|
||||||
''', re.VERBOSE)
|
''', re.VERBOSE)
|
||||||
tables = table_pattern.findall(markdown_text)
|
border_tables = border_table_pattern.findall(markdown_text)
|
||||||
remainder = table_pattern.sub('', markdown_text)
|
tables.extend(border_tables)
|
||||||
|
remainder = border_table_pattern.sub('', remainder)
|
||||||
|
|
||||||
# Borderless Markdown table
|
# Borderless Markdown table
|
||||||
no_border_table_pattern = re.compile(
|
no_border_table_pattern = re.compile(
|
||||||
|
r'''
|
||||||
|
(?:\n|^)
|
||||||
|
(?:\S.*?\|.*?\n)
|
||||||
|
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
|
||||||
|
(?:\S.*?\|.*?\n)+
|
||||||
|
''', re.VERBOSE)
|
||||||
|
no_border_tables = no_border_table_pattern.findall(remainder)
|
||||||
|
tables.extend(no_border_tables)
|
||||||
|
remainder = no_border_table_pattern.sub('', remainder)
|
||||||
|
|
||||||
|
if "<table>" in remainder.lower(): # for optimize performance
|
||||||
|
#HTML table extraction - handle possible html/body wrapper tags
|
||||||
|
html_table_pattern = re.compile(
|
||||||
r'''
|
r'''
|
||||||
(?:\n|^)
|
(?:\n|^)
|
||||||
(?:\S.*?\|.*?\n)
|
\s*
|
||||||
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
|
(?:
|
||||||
(?:\S.*?\|.*?\n)+
|
# case1: <html><body><table>...</table></body></html>
|
||||||
''', re.VERBOSE)
|
(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
|
||||||
no_border_tables = no_border_table_pattern.findall(remainder)
|
|
|
||||||
tables.extend(no_border_tables)
|
# case2: <body><table>...</table></body>
|
||||||
remainder = no_border_table_pattern.sub('', remainder)
|
(?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
||||||
|
|
|
||||||
|
# case3: only<table>...</table>
|
||||||
|
(?:<table[^>]*>.*?</table>)
|
||||||
|
)
|
||||||
|
\s*
|
||||||
|
(?=\n|$)
|
||||||
|
''',
|
||||||
|
re.VERBOSE | re.DOTALL | re.IGNORECASE
|
||||||
|
)
|
||||||
|
html_tables = html_table_pattern.findall(remainder)
|
||||||
|
tables.extend(html_tables)
|
||||||
|
remainder = html_table_pattern.sub('', remainder)
|
||||||
|
|
||||||
return remainder, tables
|
return remainder, tables
|
||||||
|
Loading…
x
Reference in New Issue
Block a user