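Clean up RAG word_extractor: drop the image_folder parameter of _extract_images_from_docx and fix the None check to test the loop variable. (#19397)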

Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
2025-08-12 23:19:04 +08:00 · 2025-05-09 16:39:16 +08:00 · 2025-05-09 16:39:16 +08:00 · 1119790b02
commit 1119790b02
parent 56cff485d0
1 changed file with 3 additions and 4 deletions
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@ -76,8 +76,7 @@ class WordExtractor(BaseExtractor):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

-    def _extract_images_from_docx(self, doc, image_folder):
-        os.makedirs(image_folder, exist_ok=True)
+    def _extract_images_from_docx(self, doc):
        image_count = 0
        image_map = {}

@ -210,7 +209,7 @@ class WordExtractor(BaseExtractor):

        content = []

-        image_map = self._extract_images_from_docx(doc, image_folder)
+        image_map = self._extract_images_from_docx(doc)

        hyperlinks_url = None
        url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
@ -225,7 +224,7 @@ class WordExtractor(BaseExtractor):
                        xml = ElementTree.XML(run.element.xml)
                        x_child = [c for c in xml.iter() if c is not None]
                        for x in x_child:
-                            if x_child is None:
+                            if x is None:
                                continue
                            if x.tag.endswith("instrText"):
                                if x.text is None: