Refa: Optimize pptx shape extraction to reduce content loss (#6703)

### What problem does this PR solve?

When parsing pptx files, some shapes do not expose a usable `shape_type` attribute. Accessing it caused the original code to throw an exception during extraction, so content extraction failed. This change introduces handling logic for such anomalous shapes, providing a safer and more robust processing mechanism.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [x] Performance Improvement
- [ ] Other (please describe):
2025-08-14 05:26:03 +08:00 · 2025-04-22 10:16:24 +08:00 · 2025-04-22 10:16:24 +08:00 · 10432a1be7
commit 10432a1be7
parent e7f83b13ca
1 changed files with 40 additions and 21 deletions
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@ -31,29 +31,48 @@ class RAGFlowPptParser:
            return paragraph.text
    def __extract(self, shape):
-        if shape.shape_type == 19:
+        try:
-            tb = shape.table
+            # First try to get text content
-            rows = []
+            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
-            for i in range(1, len(tb.rows)):
+                text_frame = shape.text_frame
-                rows.append("; ".join([tb.cell(
+                texts = []
-                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+                for paragraph in text_frame.paragraphs:
-            return "\n".join(rows)
+                    if paragraph.text.strip():
                        texts.append(self.__get_bulleted_text(paragraph))
                return "\n".join(texts)
-        if shape.has_text_frame:
+            # Safely get shape_type
-            text_frame = shape.text_frame
+            try:
-            texts = []
+                shape_type = shape.shape_type
-            for paragraph in text_frame.paragraphs:
+            except NotImplementedError:
-                if paragraph.text.strip():
+                # If shape_type is not available, try to get text content
-                    texts.append(self.__get_bulleted_text(paragraph))
+                if hasattr(shape, 'text'):
-            return "\n".join(texts)
+                    return shape.text.strip()
                return ""
-        if shape.shape_type == 6:
+            # Handle table
-            texts = []
+            if shape_type == 19:
-            for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
+                tb = shape.table
-                t = self.__extract(p)
+                rows = []
-                if t:
+                for i in range(1, len(tb.rows)):
-                    texts.append(t)
+                    rows.append("; ".join([tb.cell(
-            return "\n".join(texts)
+                        0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
                return "\n".join(rows)
            # Handle group shape
            if shape_type == 6:
                texts = []
                for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
                    t = self.__extract_texts(p)
                    if t:
                        texts.append(t)
                return "\n".join(texts)
            return ""
        except Exception as e:
            logging.error(f"Error processing shape: {str(e)}")
            return ""
    def __call__(self, fnm, from_page, to_page, callback=None):
        ppt = Presentation(fnm) if isinstance(