reopen PR for #14411 (#16148)

This commit is contained in:
cyflhn 2025-03-19 10:24:35 +08:00 committed by GitHub
parent ac80c04bd3
commit 1789437cc5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 24 additions and 12 deletions

View File

@ -2,7 +2,6 @@ import csv
import io
import json
import logging
import operator
import os
import tempfile
from collections.abc import Mapping, Sequence
@ -12,6 +11,9 @@ import docx
import pandas as pd
import pypdfium2 # type: ignore
import yaml # type: ignore
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
@ -231,6 +233,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
def paser_docx_part(block, doc: Document, content_items, i):
    """
    Wrap one DOCX body element and record it in ``content_items``.

    A ``CT_P`` element is wrapped as ``Paragraph(block, doc)`` and tagged
    "paragraph"; a ``CT_Tbl`` element is wrapped as ``Table(block, doc)`` and
    tagged "table". Any other element type is ignored.

    :param block: element to inspect
    :param doc: document passed as the second argument to the wrapper
    :param content_items: list that receives ``(i, tag, wrapped)`` tuples;
        it is mutated in place
    :param i: index stored as the first field of the appended tuple
    :return: None
    """
    if isinstance(block, CT_P):
        tag, wrapped = "paragraph", Paragraph(block, doc)
    elif isinstance(block, CT_Tbl):
        tag, wrapped = "table", Table(block, doc)
    else:
        # Neither a paragraph nor a table element: nothing to record.
        return
    content_items.append((i, tag, wrapped))
def _extract_text_from_docx(file_content: bytes) -> str:
"""
Extract text from a DOCX file.
@ -244,16 +253,13 @@ def _extract_text_from_docx(file_content: bytes) -> str:
# Keep track of paragraph and table positions
content_items: list[tuple[int, str, Table | Paragraph]] = []
# Process paragraphs and tables
for i, paragraph in enumerate(doc.paragraphs):
if paragraph.text.strip():
content_items.append((i, "paragraph", paragraph))
for i, table in enumerate(doc.tables):
content_items.append((i, "table", table))
# Sort content items based on their original position
content_items.sort(key=operator.itemgetter(0))
it = iter(doc.element.body)
part = next(it, None)
i = 0
while part is not None:
paser_docx_part(part, doc, content_items, i)
i = i + 1
part = next(it, None)
# Process sorted content
for _, item_type, item in content_items:

View File

@ -1,6 +1,7 @@
from unittest.mock import Mock, patch
import pytest
from docx.oxml.text.paragraph import CT_P
from core.file import File, FileTransferMethod
from core.variables import ArrayFileSegment
@ -169,7 +170,12 @@ def test_extract_text_from_docx(mock_document):
mock_paragraph2 = Mock()
mock_paragraph2.text = "Paragraph 2"
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
mock_ct_p1 = Mock(spec=CT_P)
mock_ct_p1.text = "Paragraph 1"
mock_ct_p2 = Mock(spec=CT_P)
mock_ct_p2.text = "Paragraph 2"
mock_element = Mock(body=[mock_ct_p1, mock_ct_p2])
mock_document.return_value.element = mock_element
text = _extract_text_from_docx(b"PK\x03\x04")
assert text == "Paragraph 1\nParagraph 2"