Refa: Improve ppt_parser to better handle lists (#6162)

### What problem does this PR solve?
This pull request (PR) adds code to the PPTX parser so that text in list
format is represented more precisely (each list item is marked with a leading `.`).

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
This commit is contained in:
Stephen Hu 2025-03-17 17:02:39 +08:00 committed by GitHub
parent 3a99c2b5f4
commit 79482ff672
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -23,6 +23,13 @@ class RAGFlowPptParser:
def __init__(self):
    """Initialize the parser by delegating to the parent class initializer."""
    super().__init__()
def __get_bulleted_text(self, paragraph):
    """Return the paragraph text, marked as a list item when it is bulleted.

    A paragraph counts as bulleted when its own ``a:pPr`` element has an
    ``a:buChar`` (character bullet) or ``a:buAutoNum`` (auto-numbered) child.
    Only the paragraph's own properties are inspected here.

    Args:
        paragraph: A text-frame paragraph exposing ``_p`` (the underlying XML
            element with an ``xpath`` method), ``level`` and ``text``.

    Returns:
        ``paragraph.text`` unchanged for a non-bulleted paragraph; otherwise
        the text prefixed with one space per indent level followed by ``.``.
    """
    bullet_queries = ("./a:pPr/a:buChar", "./a:pPr/a:buAutoNum")
    # Each xpath call returns a list of matches; any non-empty list means the
    # paragraph carries an explicit bullet definition.
    is_bulleted = any(paragraph._p.xpath(query) for query in bullet_queries)
    if not is_bulleted:
        return paragraph.text
    # The indent mirrors the list nesting depth; the "." is the list-item hint.
    return f"{' ' * paragraph.level}.{paragraph.text}"
def __extract(self, shape):
if shape.shape_type == 19:
tb = shape.table
@ -33,7 +40,12 @@ class RAGFlowPptParser:
return "\n".join(rows)
if shape.has_text_frame:
return shape.text_frame.text
text_frame = shape.text_frame
texts = []
for paragraph in text_frame.paragraphs:
if paragraph.text.strip():
texts.append(self.__get_bulleted_text(paragraph))
return "\n".join(texts)
if shape.shape_type == 6:
texts = []
@ -65,4 +77,4 @@ class RAGFlowPptParser:
logging.exception(e)
txts.append("\n".join(texts))
return txts
return txts