Refa: Optimize pptx shape extraction to reduce content loss (#6703)

### What problem does this PR solve?

When parsing pptx files, some shapes do not implement the `shape_type`
property, so accessing it raises `NotImplementedError`. The original code
did not handle this exception, so extraction failed and the content was
lost. This change reads `shape_type` defensively and falls back to the
shape's plain text when it is unavailable, making extraction safer and
more robust.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [x] Performance Improvement
- [ ] Other (please describe):
This commit is contained in:
zhudongwork 2025-04-22 10:16:24 +08:00 committed by GitHub
parent e7f83b13ca
commit 10432a1be7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -31,29 +31,48 @@ class RAGFlowPptParser:
return paragraph.text return paragraph.text
def __extract(self, shape): def __extract(self, shape):
if shape.shape_type == 19: try:
tb = shape.table # First try to get text content
rows = [] if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
for i in range(1, len(tb.rows)): text_frame = shape.text_frame
rows.append("; ".join([tb.cell( texts = []
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)])) for paragraph in text_frame.paragraphs:
return "\n".join(rows) if paragraph.text.strip():
texts.append(self.__get_bulleted_text(paragraph))
return "\n".join(texts)
if shape.has_text_frame: # Safely get shape_type
text_frame = shape.text_frame try:
texts = [] shape_type = shape.shape_type
for paragraph in text_frame.paragraphs: except NotImplementedError:
if paragraph.text.strip(): # If shape_type is not available, try to get text content
texts.append(self.__get_bulleted_text(paragraph)) if hasattr(shape, 'text'):
return "\n".join(texts) return shape.text.strip()
return ""
if shape.shape_type == 6: # Handle table
texts = [] if shape_type == 19:
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)): tb = shape.table
t = self.__extract(p) rows = []
if t: for i in range(1, len(tb.rows)):
texts.append(t) rows.append("; ".join([tb.cell(
return "\n".join(texts) 0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)
# Handle group shape
if shape_type == 6:
texts = []
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
t = self.__extract_texts(p)
if t:
texts.append(t)
return "\n".join(texts)
return ""
except Exception as e:
logging.error(f"Error processing shape: {str(e)}")
return ""
def __call__(self, fnm, from_page, to_page, callback=None): def __call__(self, fnm, from_page, to_page, callback=None):
ppt = Presentation(fnm) if isinstance( ppt = Presentation(fnm) if isinstance(