From 0e0ebaac5fc4e4370880b9f56bffe44f44d14553 Mon Sep 17 00:00:00 2001
From: fansir
Date: Fri, 21 Mar 2025 18:42:36 +0800
Subject: [PATCH] Feat: Adds hierarchical title path tracking for tables in DOCX documents to improve context association (#6374)

### What problem does this PR solve?

Adds hierarchical title path tracking for tables in DOCX documents to
improve context association. Previously, extracted tables lacked
positional context within document structure.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
Review notes (ignored by git-am, not part of the commit message):
- Line breaks, indentation and the HTML string literals ("<table>",
  "<caption>", "<tr>") were lost in the extracted copy of this patch and
  are reconstructed here; confirm them against rag/app/naive.py before
  applying.
- __get_nearest_title was amended during review, so the blob hashes on
  the "index" line no longer describe the post-image.

 rag/app/naive.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/rag/app/naive.py b/rag/app/naive.py
index 5687b26ba..f771f665c 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -67,6 +67,79 @@ class Docx(DocxParser):
         line = re.sub(r"\u3000", " ", line).strip()
         return line
 
+    def __get_nearest_title(self, table_index, filename):
+        """Return the heading path that leads to a top-level table.
+
+        The document body is scanned in order up to the table at
+        ``table_index``.  Walking back from the table, the closest non-empty
+        "Heading N" paragraph (N <= 7) becomes the table's title, followed by
+        each nearest preceding heading of a strictly smaller level.
+
+        Args:
+            table_index: Zero-based position of the table among the tables
+                that are direct children of the document body.
+            filename: Source file name; the part before a trailing alphabetic
+                extension becomes the root of the returned path.
+
+        Returns:
+            ``"<doc name> > <top heading> > ... > <nearest heading>"``, or
+            ``""`` when the table or a preceding heading cannot be found.
+        """
+        from docx.text.paragraph import Paragraph
+
+        # ``filename`` is also handed to Document(), so it may not be a str.
+        name = filename if isinstance(filename, str) else ""
+        doc_name = re.sub(r"\.[a-zA-Z]+$", "", name) or "Untitled Document"
+
+        # One pass over the body: keep the paragraphs seen before the table.
+        preceding = []
+        tables_seen = 0
+        found_table = False
+        try:
+            for block in self.doc._element.body:
+                # Non-element nodes have a non-str tag; drop the namespace.
+                tag = block.tag if isinstance(block.tag, str) else ""
+                local_name = tag.rpartition("}")[2]
+                if local_name == "tbl":
+                    if tables_seen == table_index:
+                        found_table = True
+                        break
+                    tables_seen += 1
+                elif local_name == "p":
+                    preceding.append(Paragraph(block, self.doc))
+        except Exception:
+            logging.exception("Error collecting blocks")
+            return ""
+
+        if not found_table:
+            return ""
+
+        # Walk back from the table.  Accepting only strictly smaller levels
+        # yields the nearest title first, then its ancestors, in one pass.
+        titles = []
+        current_level = 8  # sentinel: the nearest title must be level <= 7
+        for para in reversed(preceding):
+            style = para.style
+            style_name = (style.name if style else "") or ""
+            heading = re.search(r"Heading\s*(\d+)", style_name, re.I)
+            if not heading:
+                continue
+            # Use the number captured after "Heading", not the first digits
+            # found anywhere in the style name.
+            level = int(heading.group(1))
+            title_text = para.text.strip()
+            if level >= current_level or not title_text:
+                continue
+            titles.append(title_text)
+            current_level = level
+            if level <= 1:
+                break
+
+        if not titles:
+            return ""
+        # ``titles`` is nearest-first; present it top-down.
+        return " > ".join([doc_name, *reversed(titles)])
+
     def __call__(self, filename, binary=None, from_page=0, to_page=100000):
         self.doc = Document(
             filename) if not binary else Document(BytesIO(binary))
@@ -108,8 +181,11 @@ class Docx(DocxParser):
         new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
 
         tbls = []
-        for tb in self.doc.tables:
+        for tbl_idx, tb in enumerate(self.doc.tables):
+            title = self.__get_nearest_title(tbl_idx, filename)
             html = "<table>"
+            if title:
+                html += f"<caption>Table Location: {title}</caption>"
             for r in tb.rows:
                 html += "<tr>"
                 i = 0