def __extract(self, shape):
    """Return the text carried by a single slide shape, or "" if none.

    Method of ``RAGFlowPptParser``. Shapes are handled in this order:

    1. A shape with a text frame yields its non-blank paragraphs, each
       rendered through ``self.__get_bulleted_text`` and joined by newlines.
    2. A shape whose ``shape_type`` raises ``NotImplementedError`` falls back
       to its stripped ``text`` attribute when it has one.
    3. A table (``shape_type == 19``) yields one line per body row in the
       form ``"<header>: <cell>; <header>: <cell>"``, using row 0 as headers.
    4. A group (``shape_type == 6``) yields the text of its children,
       extracted recursively and ordered top-to-bottom, then left-to-right.

    Extraction is best-effort: any exception is logged and "" is returned,
    so one malformed shape does not abort parsing of the whole deck.

    Args:
        shape: A shape object exposing the python-pptx shape attributes used
            below (``has_text_frame``, ``shape_type``, ``table``, ``shapes``).

    Returns:
        The extracted text as a ``str``; "" when nothing could be extracted.
    """

    def reading_order(child):
        # A child without an explicit position may report None for
        # top/left; count that as 0 so sorting cannot raise TypeError and
        # discard the whole group. Bucketing `top` by 10 units makes
        # children at nearly the same height sort left-to-right.
        top = child.top if child.top is not None else 0
        left = child.left if child.left is not None else 0
        return (top // 10, left)

    try:
        # Plain text shapes are the common case, so try them first.
        if getattr(shape, "has_text_frame", False):
            texts = []
            for paragraph in shape.text_frame.paragraphs:
                if paragraph.text.strip():
                    texts.append(self.__get_bulleted_text(paragraph))
            return "\n".join(texts)

        # Reading shape_type can raise NotImplementedError for some shapes;
        # fall back to whatever raw text the shape exposes.
        try:
            shape_type = shape.shape_type
        except NotImplementedError:
            if hasattr(shape, "text"):
                return shape.text.strip()
            return ""

        # Table: pair every body cell with the header cell of its column.
        if shape_type == 19:
            tb = shape.table
            column_count = len(tb.columns)
            rows = []
            for i in range(1, len(tb.rows)):
                rows.append("; ".join(
                    tb.cell(0, j).text + ": " + tb.cell(i, j).text
                    for j in range(column_count)
                    if tb.cell(i, j)
                ))
            return "\n".join(rows)

        # Group: recurse into the children in reading order. The recursive
        # call must target this method; calling a non-existent sibling would
        # raise AttributeError, which the handler below would swallow,
        # silently dropping all grouped text.
        if shape_type == 6:
            texts = []
            for child in sorted(shape.shapes, key=reading_order):
                child_text = self.__extract(child)
                if child_text:
                    texts.append(child_text)
            return "\n".join(texts)

        return ""

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and carry on.
        logging.error("Error processing shape: %s", e)
        return ""