mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-14 05:55:59 +08:00
Refa: Optimize pptx shape extraction to reduce content loss (#6703)
### What problem does this PR solve?

When parsing pptx files, some shapes do not contain the `shape_type` attribute, which causes the original code to throw an exception during extraction, leading to failure in content extraction. This optimization introduces handling logic for such anomalous shapes, providing a safer and more robust processing mechanism.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [x] Performance Improvement
- [ ] Other (please describe):
This commit is contained in:
parent
e7f83b13ca
commit
10432a1be7
@ -31,29 +31,48 @@ class RAGFlowPptParser:
|
|||||||
return paragraph.text
|
return paragraph.text
|
||||||
|
|
||||||
def __extract(self, shape):
|
def __extract(self, shape):
|
||||||
if shape.shape_type == 19:
|
try:
|
||||||
tb = shape.table
|
# First try to get text content
|
||||||
rows = []
|
if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
|
||||||
for i in range(1, len(tb.rows)):
|
text_frame = shape.text_frame
|
||||||
rows.append("; ".join([tb.cell(
|
texts = []
|
||||||
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
for paragraph in text_frame.paragraphs:
|
||||||
return "\n".join(rows)
|
if paragraph.text.strip():
|
||||||
|
texts.append(self.__get_bulleted_text(paragraph))
|
||||||
|
return "\n".join(texts)
|
||||||
|
|
||||||
if shape.has_text_frame:
|
# Safely get shape_type
|
||||||
text_frame = shape.text_frame
|
try:
|
||||||
texts = []
|
shape_type = shape.shape_type
|
||||||
for paragraph in text_frame.paragraphs:
|
except NotImplementedError:
|
||||||
if paragraph.text.strip():
|
# If shape_type is not available, try to get text content
|
||||||
texts.append(self.__get_bulleted_text(paragraph))
|
if hasattr(shape, 'text'):
|
||||||
return "\n".join(texts)
|
return shape.text.strip()
|
||||||
|
return ""
|
||||||
|
|
||||||
if shape.shape_type == 6:
|
# Handle table
|
||||||
texts = []
|
if shape_type == 19:
|
||||||
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
|
tb = shape.table
|
||||||
t = self.__extract(p)
|
rows = []
|
||||||
if t:
|
for i in range(1, len(tb.rows)):
|
||||||
texts.append(t)
|
rows.append("; ".join([tb.cell(
|
||||||
return "\n".join(texts)
|
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||||
|
return "\n".join(rows)
|
||||||
|
|
||||||
|
# Handle group shape
|
||||||
|
if shape_type == 6:
|
||||||
|
texts = []
|
||||||
|
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
|
||||||
|
t = self.__extract_texts(p)
|
||||||
|
if t:
|
||||||
|
texts.append(t)
|
||||||
|
return "\n".join(texts)
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Error processing shape: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
def __call__(self, fnm, from_page, to_page, callback=None):
|
def __call__(self, fnm, from_page, to_page, callback=None):
|
||||||
ppt = Presentation(fnm) if isinstance(
|
ppt = Presentation(fnm) if isinstance(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user