Refa: Optimize pptx shape extraction to reduce content loss (#6703)

### What problem does this PR solve?

When parsing pptx files, some shapes do not contain the `shape_type`
attribute, which causes the original code to throw an exception during
extraction, leading to failure in content extraction. This optimization
introduces handling logic for such anomalous shapes, providing a safer
and more robust processing mechanism.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [x] Performance Improvement
- [ ] Other (please describe):
This commit is contained in:
zhudongwork 2025-04-22 10:16:24 +08:00 committed by GitHub
parent e7f83b13ca
commit 10432a1be7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -31,29 +31,48 @@ class RAGFlowPptParser:
return paragraph.text
def __extract(self, shape):
if shape.shape_type == 19:
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)
try:
# First try to get text content
if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
text_frame = shape.text_frame
texts = []
for paragraph in text_frame.paragraphs:
if paragraph.text.strip():
texts.append(self.__get_bulleted_text(paragraph))
return "\n".join(texts)
if shape.has_text_frame:
text_frame = shape.text_frame
texts = []
for paragraph in text_frame.paragraphs:
if paragraph.text.strip():
texts.append(self.__get_bulleted_text(paragraph))
return "\n".join(texts)
# Safely get shape_type
try:
shape_type = shape.shape_type
except NotImplementedError:
# If shape_type is not available, try to get text content
if hasattr(shape, 'text'):
return shape.text.strip()
return ""
if shape.shape_type == 6:
texts = []
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
t = self.__extract(p)
if t:
texts.append(t)
return "\n".join(texts)
# Handle table
if shape_type == 19:
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)
# Handle group shape
if shape_type == 6:
texts = []
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
t = self.__extract_texts(p)
if t:
texts.append(t)
return "\n".join(texts)
return ""
except Exception as e:
logging.error(f"Error processing shape: {str(e)}")
return ""
def __call__(self, fnm, from_page, to_page, callback=None):
ppt = Presentation(fnm) if isinstance(