def paser_docx_part(block, doc: Document, content_items, i):
    """Wrap one DOCX body element and record it in ``content_items``.

    Args:
        block: A child element of the document body. Only ``CT_P``
            (paragraph) and ``CT_Tbl`` (table) elements are handled.
        doc: The parent document, passed through to the ``Paragraph`` /
            ``Table`` wrapper constructors.
        content_items: List that is mutated in place; a tuple of
            ``(i, "paragraph" | "table", wrapper)`` is appended for each
            recognized element.
        i: Position tag stored as the first member of the appended tuple.

    Returns:
        None. Elements that are neither ``CT_P`` nor ``CT_Tbl`` are ignored
        and leave ``content_items`` untouched.
    """
    if isinstance(block, CT_P):
        entry = (i, "paragraph", Paragraph(block, doc))
    elif isinstance(block, CT_Tbl):
        entry = (i, "table", Table(block, doc))
    else:
        # Any other kind of body child is skipped.
        return

    content_items.append(entry)
@@ -244,16 +253,13 @@ def _extract_text_from_docx(file_content: bytes) -> str: # Keep track of paragraph and table positions content_items: list[tuple[int, str, Table | Paragraph]] = [] - # Process paragraphs and tables - for i, paragraph in enumerate(doc.paragraphs): - if paragraph.text.strip(): - content_items.append((i, "paragraph", paragraph)) - - for i, table in enumerate(doc.tables): - content_items.append((i, "table", table)) - - # Sort content items based on their original position - content_items.sort(key=operator.itemgetter(0)) + it = iter(doc.element.body) + part = next(it, None) + i = 0 + while part is not None: + paser_docx_part(part, doc, content_items, i) + i = i + 1 + part = next(it, None) # Process sorted content for _, item_type, item in content_items: diff --git a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py index 5dfdfc0ebd..de739ce9f5 100644 --- a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py +++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py @@ -1,6 +1,7 @@ from unittest.mock import Mock, patch import pytest +from docx.oxml.text.paragraph import CT_P from core.file import File, FileTransferMethod from core.variables import ArrayFileSegment @@ -169,7 +170,12 @@ def test_extract_text_from_docx(mock_document): mock_paragraph2 = Mock() mock_paragraph2.text = "Paragraph 2" mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2] - + mock_ct_p1 = Mock(spec=CT_P) + mock_ct_p1.text = "Paragraph 1" + mock_ct_p2 = Mock(spec=CT_P) + mock_ct_p2.text = "Paragraph 2" + mock_element = Mock(body=[mock_ct_p1, mock_ct_p2]) + mock_document.return_value.element = mock_element text = _extract_text_from_docx(b"PK\x03\x04") assert text == "Paragraph 1\nParagraph 2"