mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 16:08:58 +08:00
extract docx filter comment element (#7092)
This commit is contained in:
parent
925f0d2e09
commit
12095f8cd6
@ -228,7 +228,7 @@ class WordExtractor(BaseExtractor):
|
|||||||
def parse_paragraph(paragraph):
|
def parse_paragraph(paragraph):
|
||||||
paragraph_content = []
|
paragraph_content = []
|
||||||
for run in paragraph.runs:
|
for run in paragraph.runs:
|
||||||
if run.element.tag.endswith('r'):
|
if hasattr(run.element, 'tag') and isinstance(element.tag, str) and run.element.tag.endswith('r'):
|
||||||
drawing_elements = run.element.findall(
|
drawing_elements = run.element.findall(
|
||||||
'.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
|
'.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
|
||||||
for drawing in drawing_elements:
|
for drawing in drawing_elements:
|
||||||
@ -248,13 +248,14 @@ class WordExtractor(BaseExtractor):
|
|||||||
paragraphs = doc.paragraphs.copy()
|
paragraphs = doc.paragraphs.copy()
|
||||||
tables = doc.tables.copy()
|
tables = doc.tables.copy()
|
||||||
for element in doc.element.body:
|
for element in doc.element.body:
|
||||||
if element.tag.endswith('p'): # paragraph
|
if hasattr(element, 'tag'):
|
||||||
para = paragraphs.pop(0)
|
if isinstance(element.tag, str) and element.tag.endswith('p'): # paragraph
|
||||||
parsed_paragraph = parse_paragraph(para)
|
para = paragraphs.pop(0)
|
||||||
if parsed_paragraph:
|
parsed_paragraph = parse_paragraph(para)
|
||||||
content.append(parsed_paragraph)
|
if parsed_paragraph:
|
||||||
elif element.tag.endswith('tbl'): # table
|
content.append(parsed_paragraph)
|
||||||
table = tables.pop(0)
|
elif isinstance(element.tag, str) and element.tag.endswith('tbl'): # table
|
||||||
content.append(self._table_to_markdown(table,image_map))
|
table = tables.pop(0)
|
||||||
|
content.append(self._table_to_markdown(table,image_map))
|
||||||
return '\n'.join(content)
|
return '\n'.join(content)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user