From 1119790b027e9c469ad17f739d30df049f3bec79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B9=9B=E9=9C=B2=E5=85=88=E7=94=9F?= Date: Fri, 9 May 2025 16:39:16 +0800 Subject: [PATCH] clean rag word_extractor. (#19397) Signed-off-by: zhanluxianshen --- api/core/rag/extractor/word_extractor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index edaa8c92fa..a4ccdcafd3 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -76,8 +76,7 @@ class WordExtractor(BaseExtractor): parsed = urlparse(url) return bool(parsed.netloc) and bool(parsed.scheme) - def _extract_images_from_docx(self, doc, image_folder): - os.makedirs(image_folder, exist_ok=True) + def _extract_images_from_docx(self, doc): image_count = 0 image_map = {} @@ -210,7 +209,7 @@ class WordExtractor(BaseExtractor): content = [] - image_map = self._extract_images_from_docx(doc, image_folder) + image_map = self._extract_images_from_docx(doc) hyperlinks_url = None url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+") @@ -225,7 +224,7 @@ class WordExtractor(BaseExtractor): xml = ElementTree.XML(run.element.xml) x_child = [c for c in xml.iter() if c is not None] for x in x_child: - if x_child is None: + if x is None: continue if x.tag.endswith("instrText"): if x.text is None: