mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-15 00:45:57 +08:00
Feat: Adds hierarchical title path tracking for tables in DOCX documents to improve context association (#6374)
### What problem does this PR solve?

Adds hierarchical title path tracking for tables in DOCX documents to improve context association. Previously, extracted tables lacked positional context within the document structure.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
8b7e53e643
commit
0e0ebaac5f
110
rag/app/naive.py
110
rag/app/naive.py
@ -67,6 +67,111 @@ class Docx(DocxParser):
|
||||
line = re.sub(r"\u3000", " ", line).strip()
|
||||
return line
|
||||
|
||||
def __get_nearest_title(self, table_index: int, filename: str) -> str:
    """Build the heading breadcrumb that leads to a top-level table.

    Walks the document body backwards from the ``table_index``-th table,
    takes the closest preceding heading (level 7 or shallower) and then each
    earlier heading whose level is strictly smaller than the last one taken,
    stopping once a level-1 heading has been reached.

    Args:
        table_index: Zero-based position of the table among the ``tbl``
            elements that are direct children of the document body.
        filename: Name of the source file. A trailing alphabetic extension
            is stripped and the remainder becomes the breadcrumb root;
            an empty or missing name falls back to "Untitled Document".

    Returns:
        A string such as ``"report > Chapter 1 > Section 1.2"``, or ``""``
        when the body cannot be read, the table is not found, or no heading
        with non-empty text precedes the table.
    """
    heading_re = re.compile(r"Heading\s*(\d+)", re.I)
    max_nearest_level = 7  # deepest heading level accepted as the nearest title

    def local_name(element):
        # Compare the namespace-stripped tag exactly instead of by suffix.
        # Nodes whose tag is not a string (lxml reports XML comments and
        # processing instructions this way) are neither paragraphs nor tables.
        tag = element.tag
        return tag.rsplit("}", 1)[-1] if isinstance(tag, str) else None

    # Get document name from filename parameter.
    doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename or "")
    if not doc_name:
        doc_name = "Untitled Document"

    # Locate the target table among the body's direct children.
    try:
        body_blocks = list(self.doc._element.body)
        target_pos = -1
        tables_seen = 0
        for pos, element in enumerate(body_blocks):
            if local_name(element) != "tbl":
                continue
            if tables_seen == table_index:
                target_pos = pos
                break
            tables_seen += 1
    except Exception as e:
        logging.error("Error collecting blocks: %s", e)
        return ""

    if target_pos == -1:
        return ""  # Target table not found

    # Imported only once a table has been located; nothing above needs it.
    from docx.text.paragraph import Paragraph

    # One backward walk collects the nearest heading and all of its parents:
    # after a heading of level N is taken, only headings with a level < N
    # that appear earlier in the document can still be its ancestors.
    titles = []
    current_level = None
    for element in reversed(body_blocks[:target_pos]):
        if local_name(element) != "p":
            continue
        try:
            paragraph = Paragraph(element, self.doc)
            style = paragraph.style
            # A style may have no name; treat that as "not a heading".
            style_name = style.name if style is not None else None
            match = heading_re.search(style_name) if style_name else None
            if match is None:
                continue
            # Use the number that follows "Heading", not the first digits
            # found anywhere in the style name.
            level = int(match.group(1))
            if current_level is None:
                if level > max_nearest_level:
                    continue
            elif level >= current_level:
                continue
            title_text = paragraph.text.strip()
            if not title_text:  # Avoid empty titles
                continue
            titles.append((level, title_text))
            current_level = level
        except Exception as e:
            # Best effort: one unreadable paragraph must not abort the walk.
            if titles:
                logging.error("Error parsing parent heading: %s", e)
            else:
                logging.error("Error parsing heading level: %s", e)
            continue
        if current_level <= 1:
            break  # Reached the top of the hierarchy

    if not titles:
        return ""

    # Titles were collected nearest-first with strictly decreasing levels,
    # so reversing yields the outermost heading first.
    titles.reverse()
    return " > ".join([doc_name] + [text for _, text in titles])
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
|
||||
self.doc = Document(
|
||||
filename) if not binary else Document(BytesIO(binary))
|
||||
@ -108,8 +213,11 @@ class Docx(DocxParser):
|
||||
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
|
||||
|
||||
tbls = []
|
||||
for tb in self.doc.tables:
|
||||
for i, tb in enumerate(self.doc.tables):
|
||||
title = self.__get_nearest_title(i, filename)
|
||||
html = "<table>"
|
||||
if title:
|
||||
html += f"<caption>Table Location: {title}</caption>"
|
||||
for r in tb.rows:
|
||||
html += "<tr>"
|
||||
i = 0
|
||||
|
Loading…
x
Reference in New Issue
Block a user