mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 22:49:00 +08:00
parent
ac80c04bd3
commit
1789437cc5
@ -2,7 +2,6 @@ import csv
|
|||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import operator
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections.abc import Mapping, Sequence
|
from collections.abc import Mapping, Sequence
|
||||||
@ -12,6 +11,9 @@ import docx
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pypdfium2 # type: ignore
|
import pypdfium2 # type: ignore
|
||||||
import yaml # type: ignore
|
import yaml # type: ignore
|
||||||
|
from docx.document import Document
|
||||||
|
from docx.oxml.table import CT_Tbl
|
||||||
|
from docx.oxml.text.paragraph import CT_P
|
||||||
from docx.table import Table
|
from docx.table import Table
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
|
|
||||||
@ -231,6 +233,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
|
|||||||
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
|
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def paser_docx_part(block, doc: Document, content_items, i):
    """
    Record one document body element in ``content_items``.

    A ``CT_P`` element is wrapped in a ``Paragraph`` and a ``CT_Tbl`` element
    is wrapped in a ``Table``; the wrapper is appended as an
    ``(i, kind, wrapper)`` tuple where ``kind`` is ``"paragraph"`` or
    ``"table"``. Any other element type is ignored.

    :param block: The body element to classify.
    :param doc: The document passed as parent to the created wrapper.
    :param content_items: List that receives the new tuple (mutated in place).
    :param i: Position index stored as the first tuple member.
    """
    if isinstance(block, CT_P):
        entry = (i, "paragraph", Paragraph(block, doc))
    elif isinstance(block, CT_Tbl):
        entry = (i, "table", Table(block, doc))
    else:
        # Neither a paragraph nor a table: nothing is recorded.
        return
    content_items.append(entry)
|
||||||
|
|
||||||
|
|
||||||
def _extract_text_from_docx(file_content: bytes) -> str:
|
def _extract_text_from_docx(file_content: bytes) -> str:
|
||||||
"""
|
"""
|
||||||
Extract text from a DOCX file.
|
Extract text from a DOCX file.
|
||||||
@ -244,16 +253,13 @@ def _extract_text_from_docx(file_content: bytes) -> str:
|
|||||||
# Keep track of paragraph and table positions
|
# Keep track of paragraph and table positions
|
||||||
content_items: list[tuple[int, str, Table | Paragraph]] = []
|
content_items: list[tuple[int, str, Table | Paragraph]] = []
|
||||||
|
|
||||||
# Process paragraphs and tables
|
it = iter(doc.element.body)
|
||||||
for i, paragraph in enumerate(doc.paragraphs):
|
part = next(it, None)
|
||||||
if paragraph.text.strip():
|
i = 0
|
||||||
content_items.append((i, "paragraph", paragraph))
|
while part is not None:
|
||||||
|
paser_docx_part(part, doc, content_items, i)
|
||||||
for i, table in enumerate(doc.tables):
|
i = i + 1
|
||||||
content_items.append((i, "table", table))
|
part = next(it, None)
|
||||||
|
|
||||||
# Sort content items based on their original position
|
|
||||||
content_items.sort(key=operator.itemgetter(0))
|
|
||||||
|
|
||||||
# Process sorted content
|
# Process sorted content
|
||||||
for _, item_type, item in content_items:
|
for _, item_type, item in content_items:
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from unittest.mock import Mock, patch
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from docx.oxml.text.paragraph import CT_P
|
||||||
|
|
||||||
from core.file import File, FileTransferMethod
|
from core.file import File, FileTransferMethod
|
||||||
from core.variables import ArrayFileSegment
|
from core.variables import ArrayFileSegment
|
||||||
@ -169,7 +170,12 @@ def test_extract_text_from_docx(mock_document):
|
|||||||
mock_paragraph2 = Mock()
|
mock_paragraph2 = Mock()
|
||||||
mock_paragraph2.text = "Paragraph 2"
|
mock_paragraph2.text = "Paragraph 2"
|
||||||
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
|
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
|
||||||
|
mock_ct_p1 = Mock(spec=CT_P)
|
||||||
|
mock_ct_p1.text = "Paragraph 1"
|
||||||
|
mock_ct_p2 = Mock(spec=CT_P)
|
||||||
|
mock_ct_p2.text = "Paragraph 2"
|
||||||
|
mock_element = Mock(body=[mock_ct_p1, mock_ct_p2])
|
||||||
|
mock_document.return_value.element = mock_element
|
||||||
text = _extract_text_from_docx(b"PK\x03\x04")
|
text = _extract_text_from_docx(b"PK\x03\x04")
|
||||||
assert text == "Paragraph 1\nParagraph 2"
|
assert text == "Paragraph 1\nParagraph 2"
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user