mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-19 15:09:11 +08:00
This commit is contained in:
parent
68552893ef
commit
c837218bc9
@ -117,19 +117,63 @@ class WordExtractor(BaseExtractor):
|
||||
|
||||
return image_map
|
||||
|
||||
def _table_to_markdown(self, table):
|
||||
markdown = ""
|
||||
# deal with table headers
|
||||
header_row = table.rows[0]
|
||||
headers = [cell.text for cell in header_row.cells]
|
||||
markdown += "| " + " | ".join(headers) + " |\n"
|
||||
markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n"
|
||||
# deal with table rows
|
||||
for row in table.rows[1:]:
|
||||
row_cells = [cell.text for cell in row.cells]
|
||||
markdown += "| " + " | ".join(row_cells) + " |\n"
|
||||
def _table_to_markdown(self, table, image_map):
|
||||
markdown = []
|
||||
# calculate the total number of columns
|
||||
total_cols = max(len(row.cells) for row in table.rows)
|
||||
|
||||
return markdown
|
||||
header_row = table.rows[0]
|
||||
headers = self._parse_row(header_row, image_map, total_cols)
|
||||
markdown.append("| " + " | ".join(headers) + " |")
|
||||
markdown.append("| " + " | ".join(["---"] * total_cols) + " |")
|
||||
|
||||
for row in table.rows[1:]:
|
||||
row_cells = self._parse_row(row, image_map, total_cols)
|
||||
markdown.append("| " + " | ".join(row_cells) + " |")
|
||||
return "\n".join(markdown)
|
||||
|
||||
def _parse_row(self, row, image_map, total_cols):
|
||||
# Initialize a row, all of which are empty by default
|
||||
row_cells = [""] * total_cols
|
||||
col_index = 0
|
||||
for cell in row.cells:
|
||||
# make sure the col_index is not out of range
|
||||
while col_index < total_cols and row_cells[col_index] != "":
|
||||
col_index += 1
|
||||
# if col_index is out of range the loop is jumped
|
||||
if col_index >= total_cols:
|
||||
break
|
||||
cell_content = self._parse_cell(cell, image_map).strip()
|
||||
cell_colspan = cell.grid_span if cell.grid_span else 1
|
||||
for i in range(cell_colspan):
|
||||
if col_index + i < total_cols:
|
||||
row_cells[col_index + i] = cell_content if i == 0 else ""
|
||||
col_index += cell_colspan
|
||||
return row_cells
|
||||
|
||||
def _parse_cell(self, cell, image_map):
|
||||
cell_content = []
|
||||
for paragraph in cell.paragraphs:
|
||||
parsed_paragraph = self._parse_cell_paragraph(paragraph, image_map)
|
||||
if parsed_paragraph:
|
||||
cell_content.append(parsed_paragraph)
|
||||
unique_content = list(dict.fromkeys(cell_content))
|
||||
return " ".join(unique_content)
|
||||
|
||||
def _parse_cell_paragraph(self, paragraph, image_map):
|
||||
paragraph_content = []
|
||||
for run in paragraph.runs:
|
||||
if run.element.xpath('.//a:blip'):
|
||||
for blip in run.element.xpath('.//a:blip'):
|
||||
image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
|
||||
image_part = paragraph.part.rels[image_id].target_part
|
||||
|
||||
if image_part in image_map:
|
||||
image_link = image_map[image_part]
|
||||
paragraph_content.append(image_link)
|
||||
else:
|
||||
paragraph_content.append(run.text)
|
||||
return "".join(paragraph_content).strip()
|
||||
|
||||
def _parse_paragraph(self, paragraph, image_map):
|
||||
paragraph_content = []
|
||||
@ -183,6 +227,6 @@ class WordExtractor(BaseExtractor):
|
||||
content.append(parsed_paragraph)
|
||||
elif element.tag.endswith('tbl'): # table
|
||||
table = tables.pop(0)
|
||||
content.append(self._table_to_markdown(table))
|
||||
content.append(self._table_to_markdown(table,image_map))
|
||||
return '\n'.join(content)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user