mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 21:39:00 +08:00
Refa: Optimize pptx shape extraction to reduce content loss (#6703)
### What problem does this PR solve?

When parsing pptx files, some shapes do not provide a usable `shape_type` attribute (accessing it raises `NotImplementedError`). The original code read `shape_type` unconditionally, so it threw an exception on these shapes and content extraction failed. This change adds handling for such anomalous shapes: it tries the shape's text frame first, reads `shape_type` defensively, and falls back to the shape's plain text, making extraction safer and more robust.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [x] Performance Improvement
- [ ] Other (please describe):
This commit is contained in:
parent
e7f83b13ca
commit
10432a1be7
@ -31,29 +31,48 @@ class RAGFlowPptParser:
|
||||
return paragraph.text
|
||||
|
||||
def __extract(self, shape):
|
||||
if shape.shape_type == 19:
|
||||
tb = shape.table
|
||||
rows = []
|
||||
for i in range(1, len(tb.rows)):
|
||||
rows.append("; ".join([tb.cell(
|
||||
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
return "\n".join(rows)
|
||||
try:
|
||||
# First try to get text content
|
||||
if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
|
||||
text_frame = shape.text_frame
|
||||
texts = []
|
||||
for paragraph in text_frame.paragraphs:
|
||||
if paragraph.text.strip():
|
||||
texts.append(self.__get_bulleted_text(paragraph))
|
||||
return "\n".join(texts)
|
||||
|
||||
if shape.has_text_frame:
|
||||
text_frame = shape.text_frame
|
||||
texts = []
|
||||
for paragraph in text_frame.paragraphs:
|
||||
if paragraph.text.strip():
|
||||
texts.append(self.__get_bulleted_text(paragraph))
|
||||
return "\n".join(texts)
|
||||
# Safely get shape_type
|
||||
try:
|
||||
shape_type = shape.shape_type
|
||||
except NotImplementedError:
|
||||
# If shape_type is not available, try to get text content
|
||||
if hasattr(shape, 'text'):
|
||||
return shape.text.strip()
|
||||
return ""
|
||||
|
||||
if shape.shape_type == 6:
|
||||
texts = []
|
||||
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
|
||||
t = self.__extract(p)
|
||||
if t:
|
||||
texts.append(t)
|
||||
return "\n".join(texts)
|
||||
# Handle table
|
||||
if shape_type == 19:
|
||||
tb = shape.table
|
||||
rows = []
|
||||
for i in range(1, len(tb.rows)):
|
||||
rows.append("; ".join([tb.cell(
|
||||
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
return "\n".join(rows)
|
||||
|
||||
# Handle group shape
|
||||
if shape_type == 6:
|
||||
texts = []
|
||||
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
|
||||
t = self.__extract_texts(p)
|
||||
if t:
|
||||
texts.append(t)
|
||||
return "\n".join(texts)
|
||||
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Error processing shape: {str(e)}")
|
||||
return ""
|
||||
|
||||
def __call__(self, fnm, from_page, to_page, callback=None):
|
||||
ppt = Presentation(fnm) if isinstance(
|
||||
|
Loading…
x
Reference in New Issue
Block a user