From b16354577134c5a928fbd8b9dc8b6b9eff9687cb Mon Sep 17 00:00:00 2001
From: Bowen Liang
Date: Thu, 7 Mar 2024 18:24:55 +0800
Subject: [PATCH] Use `python-docx` to extract docx files (#2654)

---
 api/core/rag/extractor/word_extractor.py | 18 ++++++++++--------
 api/requirements.txt                     |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py
index 8e2cd14be7..a41c727f35 100644
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -10,7 +10,7 @@ from core.rag.models.document import Document
 
 
 class WordExtractor(BaseExtractor):
-    """Load pdf files.
+    """Load docx files.
 
 
     Args:
@@ -46,14 +46,16 @@ class WordExtractor(BaseExtractor):
 
     def extract(self) -> list[Document]:
         """Load given path as single page."""
-        import docx2txt
+        from docx import Document as docx_Document
 
-        return [
-            Document(
-                page_content=docx2txt.process(self.file_path),
-                metadata={"source": self.file_path},
-            )
-        ]
+        document = docx_Document(self.file_path)
+        doc_texts = [paragraph.text for paragraph in document.paragraphs]
+        content = '\n'.join(doc_texts)
+
+        return [Document(
+            page_content=content,
+            metadata={"source": self.file_path},
+        )]
 
     @staticmethod
     def _is_valid_url(url: str) -> bool:
diff --git a/api/requirements.txt b/api/requirements.txt
index 9721c3a13d..847903c4f4 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -32,7 +32,7 @@ celery==5.2.7
 redis~=4.5.4
 openpyxl==3.1.2
 chardet~=5.1.0
-docx2txt==0.8
+python-docx~=1.1.0
 pypdfium2==4.16.0
 resend~=0.7.0
 pyjwt~=2.8.0