fix #6902 .docx handles images within tables and handles cross-column tables (#6951)

This commit is contained in:
yanghx 2024-08-06 17:14:24 +08:00 committed by Joel
parent 68552893ef
commit c837218bc9

View File

@ -117,19 +117,63 @@ class WordExtractor(BaseExtractor):
return image_map
def _table_to_markdown(self, table):
markdown = ""
# deal with table headers
header_row = table.rows[0]
headers = [cell.text for cell in header_row.cells]
markdown += "| " + " | ".join(headers) + " |\n"
markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n"
# deal with table rows
for row in table.rows[1:]:
row_cells = [cell.text for cell in row.cells]
markdown += "| " + " | ".join(row_cells) + " |\n"
def _table_to_markdown(self, table, image_map):
markdown = []
# calculate the total number of columns
total_cols = max(len(row.cells) for row in table.rows)
return markdown
header_row = table.rows[0]
headers = self._parse_row(header_row, image_map, total_cols)
markdown.append("| " + " | ".join(headers) + " |")
markdown.append("| " + " | ".join(["---"] * total_cols) + " |")
for row in table.rows[1:]:
row_cells = self._parse_row(row, image_map, total_cols)
markdown.append("| " + " | ".join(row_cells) + " |")
return "\n".join(markdown)
def _parse_row(self, row, image_map, total_cols):
# Initialize a row, all of which are empty by default
row_cells = [""] * total_cols
col_index = 0
for cell in row.cells:
# make sure the col_index is not out of range
while col_index < total_cols and row_cells[col_index] != "":
col_index += 1
# if col_index is out of range the loop is jumped
if col_index >= total_cols:
break
cell_content = self._parse_cell(cell, image_map).strip()
cell_colspan = cell.grid_span if cell.grid_span else 1
for i in range(cell_colspan):
if col_index + i < total_cols:
row_cells[col_index + i] = cell_content if i == 0 else ""
col_index += cell_colspan
return row_cells
def _parse_cell(self, cell, image_map):
cell_content = []
for paragraph in cell.paragraphs:
parsed_paragraph = self._parse_cell_paragraph(paragraph, image_map)
if parsed_paragraph:
cell_content.append(parsed_paragraph)
unique_content = list(dict.fromkeys(cell_content))
return " ".join(unique_content)
def _parse_cell_paragraph(self, paragraph, image_map):
paragraph_content = []
for run in paragraph.runs:
if run.element.xpath('.//a:blip'):
for blip in run.element.xpath('.//a:blip'):
image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
image_part = paragraph.part.rels[image_id].target_part
if image_part in image_map:
image_link = image_map[image_part]
paragraph_content.append(image_link)
else:
paragraph_content.append(run.text)
return "".join(paragraph_content).strip()
def _parse_paragraph(self, paragraph, image_map):
paragraph_content = []
@ -183,6 +227,6 @@ class WordExtractor(BaseExtractor):
content.append(parsed_paragraph)
elif element.tag.endswith('tbl'): # table
table = tables.pop(0)
content.append(self._table_to_markdown(table))
content.append(self._table_to_markdown(table,image_map))
return '\n'.join(content)