Refa: Optimize pptx shape extraction to reduce content loss (#6703)

### What problem does this PR solve?

When parsing pptx files, some shapes do not provide a usable `shape_type` attribute: accessing it raises `NotImplementedError`. The original code did not handle this, so the exception propagated during extraction and content extraction failed. This change adds handling for such anomalous shapes: it reads the text frame first, falls back to the shape's plain text when `shape_type` is unavailable, and logs the error and returns an empty string for any shape that still cannot be processed, providing a safer and more robust processing mechanism.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [x] Performance Improvement
- [ ] Other (please describe):
2025-08-12 21:39:00 +08:00 · 2025-04-22 10:16:24 +08:00 · 2025-04-22 10:16:24 +08:00 · 10432a1be7
commit 10432a1be7
parent e7f83b13ca
1 changed file with 40 additions and 21 deletions
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -31,29 +31,48 @@ class RAGFlowPptParser:
            return paragraph.text

    def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(
-                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
+        try:
+            # First try to get text content
+            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
+                text_frame = shape.text_frame
+                texts = []
+                for paragraph in text_frame.paragraphs:
+                    if paragraph.text.strip():
+                        texts.append(self.__get_bulleted_text(paragraph))
+                return "\n".join(texts)

-        if shape.has_text_frame:
-            text_frame = shape.text_frame
-            texts = []
-            for paragraph in text_frame.paragraphs:
-                if paragraph.text.strip():
-                    texts.append(self.__get_bulleted_text(paragraph))
-            return "\n".join(texts)
+            # Safely get shape_type
+            try:
+                shape_type = shape.shape_type
+            except NotImplementedError:
+                # If shape_type is not available, try to get text content
+                if hasattr(shape, 'text'):
+                    return shape.text.strip()
+                return ""

-        if shape.shape_type == 6:
-            texts = []
-            for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
-                t = self.__extract(p)
-                if t:
-                    texts.append(t)
-            return "\n".join(texts)
+            # Handle table
+            if shape_type == 19:
+                tb = shape.table
+                rows = []
+                for i in range(1, len(tb.rows)):
+                    rows.append("; ".join([tb.cell(
+                        0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+                return "\n".join(rows)
+
+            # Handle group shape
+            if shape_type == 6:
+                texts = []
+                for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
+                    t = self.__extract_texts(p)
+                    if t:
+                        texts.append(t)
+                return "\n".join(texts)
+
+            return ""
+
+        except Exception as e:
+            logging.error(f"Error processing shape: {str(e)}")
+            return ""

    def __call__(self, fnm, from_page, to_page, callback=None):
        ppt = Presentation(fnm) if isinstance(