mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-20 05:39:07 +08:00
This commit is contained in:
parent
68552893ef
commit
c837218bc9
@ -117,19 +117,63 @@ class WordExtractor(BaseExtractor):
|
|||||||
|
|
||||||
return image_map
|
return image_map
|
||||||
|
|
||||||
def _table_to_markdown(self, table):
|
def _table_to_markdown(self, table, image_map):
|
||||||
markdown = ""
|
markdown = []
|
||||||
# deal with table headers
|
# calculate the total number of columns
|
||||||
header_row = table.rows[0]
|
total_cols = max(len(row.cells) for row in table.rows)
|
||||||
headers = [cell.text for cell in header_row.cells]
|
|
||||||
markdown += "| " + " | ".join(headers) + " |\n"
|
|
||||||
markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n"
|
|
||||||
# deal with table rows
|
|
||||||
for row in table.rows[1:]:
|
|
||||||
row_cells = [cell.text for cell in row.cells]
|
|
||||||
markdown += "| " + " | ".join(row_cells) + " |\n"
|
|
||||||
|
|
||||||
return markdown
|
header_row = table.rows[0]
|
||||||
|
headers = self._parse_row(header_row, image_map, total_cols)
|
||||||
|
markdown.append("| " + " | ".join(headers) + " |")
|
||||||
|
markdown.append("| " + " | ".join(["---"] * total_cols) + " |")
|
||||||
|
|
||||||
|
for row in table.rows[1:]:
|
||||||
|
row_cells = self._parse_row(row, image_map, total_cols)
|
||||||
|
markdown.append("| " + " | ".join(row_cells) + " |")
|
||||||
|
return "\n".join(markdown)
|
||||||
|
|
||||||
|
def _parse_row(self, row, image_map, total_cols):
|
||||||
|
# Initialize a row, all of which are empty by default
|
||||||
|
row_cells = [""] * total_cols
|
||||||
|
col_index = 0
|
||||||
|
for cell in row.cells:
|
||||||
|
# make sure the col_index is not out of range
|
||||||
|
while col_index < total_cols and row_cells[col_index] != "":
|
||||||
|
col_index += 1
|
||||||
|
# if col_index is out of range the loop is jumped
|
||||||
|
if col_index >= total_cols:
|
||||||
|
break
|
||||||
|
cell_content = self._parse_cell(cell, image_map).strip()
|
||||||
|
cell_colspan = cell.grid_span if cell.grid_span else 1
|
||||||
|
for i in range(cell_colspan):
|
||||||
|
if col_index + i < total_cols:
|
||||||
|
row_cells[col_index + i] = cell_content if i == 0 else ""
|
||||||
|
col_index += cell_colspan
|
||||||
|
return row_cells
|
||||||
|
|
||||||
|
def _parse_cell(self, cell, image_map):
|
||||||
|
cell_content = []
|
||||||
|
for paragraph in cell.paragraphs:
|
||||||
|
parsed_paragraph = self._parse_cell_paragraph(paragraph, image_map)
|
||||||
|
if parsed_paragraph:
|
||||||
|
cell_content.append(parsed_paragraph)
|
||||||
|
unique_content = list(dict.fromkeys(cell_content))
|
||||||
|
return " ".join(unique_content)
|
||||||
|
|
||||||
|
def _parse_cell_paragraph(self, paragraph, image_map):
|
||||||
|
paragraph_content = []
|
||||||
|
for run in paragraph.runs:
|
||||||
|
if run.element.xpath('.//a:blip'):
|
||||||
|
for blip in run.element.xpath('.//a:blip'):
|
||||||
|
image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
|
||||||
|
image_part = paragraph.part.rels[image_id].target_part
|
||||||
|
|
||||||
|
if image_part in image_map:
|
||||||
|
image_link = image_map[image_part]
|
||||||
|
paragraph_content.append(image_link)
|
||||||
|
else:
|
||||||
|
paragraph_content.append(run.text)
|
||||||
|
return "".join(paragraph_content).strip()
|
||||||
|
|
||||||
def _parse_paragraph(self, paragraph, image_map):
|
def _parse_paragraph(self, paragraph, image_map):
|
||||||
paragraph_content = []
|
paragraph_content = []
|
||||||
@ -183,6 +227,6 @@ class WordExtractor(BaseExtractor):
|
|||||||
content.append(parsed_paragraph)
|
content.append(parsed_paragraph)
|
||||||
elif element.tag.endswith('tbl'): # table
|
elif element.tag.endswith('tbl'): # table
|
||||||
table = tables.pop(0)
|
table = tables.pop(0)
|
||||||
content.append(self._table_to_markdown(table))
|
content.append(self._table_to_markdown(table,image_map))
|
||||||
return '\n'.join(content)
|
return '\n'.join(content)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user