From b16354577134c5a928fbd8b9dc8b6b9eff9687cb Mon Sep 17 00:00:00 2001
From: Bowen Liang <liangbowen@gf.com.cn>
Date: Thu, 7 Mar 2024 18:24:55 +0800
Subject: [PATCH] Use `python-docx` to extract docx files (#2654)

---
 api/core/rag/extractor/word_extractor.py | 18 ++++++++++--------
 api/requirements.txt                     |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py
index 8e2cd14be7..a41c727f35 100644
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -10,7 +10,7 @@ from core.rag.models.document import Document
 
 
 class WordExtractor(BaseExtractor):
-    """Load pdf files.
+    """Load docx files.
 
 
     Args:
@@ -46,14 +46,16 @@ class WordExtractor(BaseExtractor):
 
     def extract(self) -> list[Document]:
         """Load given path as single page."""
-        import docx2txt
+        from docx import Document as docx_Document
 
-        return [
-            Document(
-                page_content=docx2txt.process(self.file_path),
-                metadata={"source": self.file_path},
-            )
-        ]
+        document = docx_Document(self.file_path)
+        doc_texts = [paragraph.text for paragraph in document.paragraphs]
+        content = '\n'.join(doc_texts)
+
+        return [Document(
+            page_content=content,
+            metadata={"source": self.file_path},
+        )]
 
     @staticmethod
     def _is_valid_url(url: str) -> bool:
diff --git a/api/requirements.txt b/api/requirements.txt
index 9721c3a13d..847903c4f4 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -32,7 +32,7 @@ celery==5.2.7
 redis~=4.5.4
 openpyxl==3.1.2
 chardet~=5.1.0
-docx2txt==0.8
+python-docx~=1.1.0
 pypdfium2==4.16.0
 resend~=0.7.0
 pyjwt~=2.8.0