Feat: Adds hierarchical title path tracking for tables in DOCX documents to improve context association (#6374)

### What problem does this PR solve?

Adds hierarchical title path tracking for tables in DOCX documents to improve context association. Previously, extracted tables lacked positional context within the document structure.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-08-15 00:45:57 +08:00 · 2025-03-21 18:42:36 +08:00 · 2025-03-21 18:42:36 +08:00 · 0e0ebaac5f
commit 0e0ebaac5f
parent 8b7e53e643
1 changed files with 109 additions and 1 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -67,6 +67,111 @@ class Docx(DocxParser):
        line = re.sub(r"\u3000", " ", line).strip()
        return line

+    def __get_nearest_title(self, table_index, filename):
+        """Get the hierarchical title structure before the table"""
+        import re
+        from docx.text.paragraph import Paragraph
+        
+        titles = []
+        blocks = []
+        
+        # Get document name from filename parameter
+        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
+        if not doc_name:
+            doc_name = "Untitled Document"
+            
+        # Collect all document blocks while maintaining document order
+        try:
+            # Iterate through all paragraphs and tables in document order
+            for i, block in enumerate(self.doc._element.body):
+                if block.tag.endswith('p'):  # Paragraph
+                    p = Paragraph(block, self.doc)
+                    blocks.append(('p', i, p))
+                elif block.tag.endswith('tbl'):  # Table
+                    blocks.append(('t', i, None))  # Table object will be retrieved later
+        except Exception as e:
+            logging.error(f"Error collecting blocks: {e}")
+            return ""
+            
+        # Find the target table position
+        target_table_pos = -1
+        table_count = 0
+        for i, (block_type, pos, _) in enumerate(blocks):
+            if block_type == 't':
+                if table_count == table_index:
+                    target_table_pos = pos
+                    break
+                table_count += 1
+                
+        if target_table_pos == -1:
+            return ""  # Target table not found
+            
+        # Find the nearest heading paragraph in reverse order
+        nearest_title = None
+        for i in range(len(blocks)-1, -1, -1):
+            block_type, pos, block = blocks[i]
+            if pos >= target_table_pos:  # Skip blocks after the table
+                continue
+                
+            if block_type != 'p':
+                continue
+                
+            if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
+                try:
+                    level_match = re.search(r"(\d+)", block.style.name)
+                    if level_match:
+                        level = int(level_match.group(1))
+                        if level <= 7:  # Support up to 7 heading levels
+                            title_text = block.text.strip()
+                            if title_text:  # Avoid empty titles
+                                nearest_title = (level, title_text)
+                                break
+                except Exception as e:
+                    logging.error(f"Error parsing heading level: {e}")
+        
+        if nearest_title:
+            # Add current title
+            titles.append(nearest_title)
+            current_level = nearest_title[0]
+            
+            # Find all parent headings, allowing cross-level search
+            while current_level > 1:
+                found = False
+                for i in range(len(blocks)-1, -1, -1):
+                    block_type, pos, block = blocks[i]
+                    if pos >= target_table_pos:  # Skip blocks after the table
+                        continue
+                        
+                    if block_type != 'p':
+                        continue
+                        
+                    if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
+                        try:
+                            level_match = re.search(r"(\d+)", block.style.name)
+                            if level_match:
+                                level = int(level_match.group(1))
+                                # Find any heading with a higher level
+                                if level < current_level:  
+                                    title_text = block.text.strip()
+                                    if title_text:  # Avoid empty titles
+                                        titles.append((level, title_text))
+                                        current_level = level
+                                        found = True
+                                        break
+                        except Exception as e:
+                            logging.error(f"Error parsing parent heading: {e}")
+                            
+                if not found:  # Break if no parent heading is found
+                    break
+            
+            # Sort by level (ascending, from highest to lowest)
+            titles.sort(key=lambda x: x[0])
+            # Organize titles (from highest to lowest)
+            hierarchy = [doc_name] + [t[1] for t in titles]
+            return " > ".join(hierarchy)
+            
+        return ""
+
    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
@@ -108,8 +213,11 @@ class Docx(DocxParser):
        new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]

        tbls = []
-        for tb in self.doc.tables:
+        for i, tb in enumerate(self.doc.tables):
+            title = self.__get_nearest_title(i, filename)
            html = "<table>"
+            if title:
+                html += f"<caption>Table Location: {title}</caption>"
            for r in tb.rows:
                html += "<tr>"
                i = 0