# Reconstructed from a whitespace-collapsed git patch:
#   commit  c837218bc96d5d168ad93c768d137e419e2bc6f2
#   author  yanghx <30469680+yanghx-git@users.noreply.github.com>
#   date    Tue, 6 Aug 2024 17:14:24 +0800
#   subject fix #6902 .docx handles images within tables and handles
#           cross-column tables (#6951)
#   file    api/core/rag/extractor/word_extractor.py
#
# Only the WordExtractor methods that are fully visible in the patch are shown
# below; the remainder of the class lies outside this excerpt.
class WordExtractor(BaseExtractor):
    def _table_to_markdown(self, table, image_map):
        """Render a Word table as a Markdown table.

        The first row becomes the header line, followed by a ``---``
        separator line and one line per remaining row. Every line has the
        same number of columns: the largest ``len(row.cells)`` in the table.

        Args:
            table: Table object exposing ``rows``; each row exposes ``cells``.
            image_map: Mapping from image parts to the text that replaces
                them; forwarded unchanged to the cell parser.

        Returns:
            The Markdown lines joined with ``"\\n"`` (no trailing newline),
            or an empty string when the table has no rows.
        """
        rows = list(table.rows)
        if not rows:
            # max() over an empty sequence and rows[0] would both raise.
            return ""

        total_cols = max(len(row.cells) for row in rows)

        header_cells = self._parse_row(rows[0], image_map, total_cols)
        markdown = [
            "| " + " | ".join(header_cells) + " |",
            "| " + " | ".join(["---"] * total_cols) + " |",
        ]
        markdown.extend(
            "| " + " | ".join(self._parse_row(row, image_map, total_cols)) + " |"
            for row in rows[1:]
        )
        return "\n".join(markdown)

    def _parse_row(self, row, image_map, total_cols):
        """Convert one table row into exactly ``total_cols`` cell strings.

        A cell that spans several grid columns puts its text in the first
        column it covers; the other covered columns stay empty.

        Args:
            row: Row object exposing ``cells``; each cell exposes
                ``grid_span`` and ``paragraphs``.
            image_map: Forwarded to ``_parse_cell``.
            total_cols: Length of the returned list.

        Returns:
            A list of ``total_cols`` strings; columns the row does not reach
            are empty strings.
        """
        row_cells = [""] * total_cols
        col_index = 0
        previous_tc = None
        for cell in row.cells:
            # python-docx yields a horizontally merged cell once per grid
            # column it spans. The span is already accounted for through
            # grid_span below, so consecutive repeats of the same underlying
            # <w:tc> element must be skipped; otherwise the merged text is
            # duplicated and the cells that follow fall off the row.
            tc = getattr(cell, "_tc", cell)
            if tc is previous_tc:
                continue
            previous_tc = tc

            if col_index >= total_cols:
                break
            row_cells[col_index] = self._parse_cell(cell, image_map).strip()
            # Columns covered by the span keep their initial "" value.
            col_index += cell.grid_span or 1
        return row_cells

    def _parse_cell(self, cell, image_map):
        """Return the content of a cell as one space-separated string.

        Paragraphs that parse to an empty string are dropped, and a paragraph
        text that already occurred in this cell is kept only once, in order
        of first appearance.
        """
        parsed = (
            self._parse_cell_paragraph(paragraph, image_map)
            for paragraph in cell.paragraphs
        )
        return " ".join(dict.fromkeys(text for text in parsed if text))

    def _parse_cell_paragraph(self, paragraph, image_map):
        """Return the paragraph text with embedded pictures replaced.

        Runs without a picture contribute ``run.text``. A run that contains
        one or more ``a:blip`` elements contributes, for each blip whose
        image part is a key of ``image_map``, the mapped value; the text of
        such a run is not emitted.

        Returns:
            The concatenated pieces with surrounding whitespace stripped.
        """
        paragraph_content = []
        for run in paragraph.runs:
            blips = run.element.xpath('.//a:blip')
            if not blips:
                paragraph_content.append(run.text)
                continue

            rels = paragraph.part.rels
            for blip in blips:
                image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
                # A blip without an embed id, or with an id that has no
                # relationship in this part, cannot be resolved to an image
                # part; skip it instead of failing the whole extraction.
                if image_id is None or image_id not in rels:
                    continue
                image_part = rels[image_id].target_part
                if image_part in image_map:
                    paragraph_content.append(image_map[image_part])
        return "".join(paragraph_content).strip()

    # The same patch also updates the call site (inside a method that is not
    # fully visible in this excerpt) to pass the image map along:
    #     content.append(self._table_to_markdown(table, image_map))