reopen PR for #14411 (#16148)

This commit is contained in:
cyflhn 2025-03-19 10:24:35 +08:00 committed by GitHub
parent ac80c04bd3
commit 1789437cc5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 24 additions and 12 deletions

View File

@ -2,7 +2,6 @@ import csv
import io import io
import json import json
import logging import logging
import operator
import os import os
import tempfile import tempfile
from collections.abc import Mapping, Sequence from collections.abc import Mapping, Sequence
@ -12,6 +11,9 @@ import docx
import pandas as pd import pandas as pd
import pypdfium2 # type: ignore import pypdfium2 # type: ignore
import yaml # type: ignore import yaml # type: ignore
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table from docx.table import Table
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
@ -231,6 +233,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
def paser_docx_part(block, doc: Document, content_items, i):
    """Wrap one document-body element and record it in ``content_items``.

    A ``CT_P`` element is wrapped in ``Paragraph`` and tagged "paragraph";
    a ``CT_Tbl`` element is wrapped in ``Table`` and tagged "table". Any
    other element type is ignored.

    Args:
        block: The element to classify.
        doc: Passed as the second argument to the ``Paragraph`` / ``Table``
            constructor.
        content_items: List that receives an ``(i, kind, wrapper)`` tuple;
            it is mutated in place.
        i: Position value stored as the first member of the tuple.

    Returns:
        None. The result is delivered by appending to ``content_items``.
    """
    if isinstance(block, CT_P):
        entry = (i, "paragraph", Paragraph(block, doc))
    elif isinstance(block, CT_Tbl):
        entry = (i, "table", Table(block, doc))
    else:
        # Neither a paragraph nor a table: nothing is recorded.
        return
    content_items.append(entry)
def _extract_text_from_docx(file_content: bytes) -> str: def _extract_text_from_docx(file_content: bytes) -> str:
""" """
Extract text from a DOCX file. Extract text from a DOCX file.
@ -244,16 +253,13 @@ def _extract_text_from_docx(file_content: bytes) -> str:
# Keep track of paragraph and table positions # Keep track of paragraph and table positions
content_items: list[tuple[int, str, Table | Paragraph]] = [] content_items: list[tuple[int, str, Table | Paragraph]] = []
# Process paragraphs and tables it = iter(doc.element.body)
for i, paragraph in enumerate(doc.paragraphs): part = next(it, None)
if paragraph.text.strip(): i = 0
content_items.append((i, "paragraph", paragraph)) while part is not None:
paser_docx_part(part, doc, content_items, i)
for i, table in enumerate(doc.tables): i = i + 1
content_items.append((i, "table", table)) part = next(it, None)
# Sort content items based on their original position
content_items.sort(key=operator.itemgetter(0))
# Process sorted content # Process sorted content
for _, item_type, item in content_items: for _, item_type, item in content_items:

View File

@ -1,6 +1,7 @@
from unittest.mock import Mock, patch from unittest.mock import Mock, patch
import pytest import pytest
from docx.oxml.text.paragraph import CT_P
from core.file import File, FileTransferMethod from core.file import File, FileTransferMethod
from core.variables import ArrayFileSegment from core.variables import ArrayFileSegment
@ -169,7 +170,12 @@ def test_extract_text_from_docx(mock_document):
mock_paragraph2 = Mock() mock_paragraph2 = Mock()
mock_paragraph2.text = "Paragraph 2" mock_paragraph2.text = "Paragraph 2"
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2] mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
mock_ct_p1 = Mock(spec=CT_P)
mock_ct_p1.text = "Paragraph 1"
mock_ct_p2 = Mock(spec=CT_P)
mock_ct_p2.text = "Paragraph 2"
mock_element = Mock(body=[mock_ct_p1, mock_ct_p2])
mock_document.return_value.element = mock_element
text = _extract_text_from_docx(b"PK\x03\x04") text = _extract_text_from_docx(b"PK\x03\x04")
assert text == "Paragraph 1\nParagraph 2" assert text == "Paragraph 1\nParagraph 2"