Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-08-12 10:28:59 +08:00)
clean rag word_extractor. (#19397)
Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
This commit is contained in:
Parent: 56cff485d0
Commit: 1119790b02
@@ -76,8 +76,7 @@ class WordExtractor(BaseExtractor):
         parsed = urlparse(url)
         return bool(parsed.netloc) and bool(parsed.scheme)
 
-    def _extract_images_from_docx(self, doc, image_folder):
-        os.makedirs(image_folder, exist_ok=True)
+    def _extract_images_from_docx(self, doc):
         image_count = 0
         image_map = {}
 
@@ -210,7 +209,7 @@ class WordExtractor(BaseExtractor):
 
         content = []
 
-        image_map = self._extract_images_from_docx(doc, image_folder)
+        image_map = self._extract_images_from_docx(doc)
 
         hyperlinks_url = None
         url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
@@ -225,7 +224,7 @@ class WordExtractor(BaseExtractor):
                     xml = ElementTree.XML(run.element.xml)
                     x_child = [c for c in xml.iter() if c is not None]
                     for x in x_child:
-                        if x_child is None:
+                        if x is None:
                             continue
                         if x.tag.endswith("instrText"):
                             if x.text is None:
Loading…
x
Reference in New Issue
Block a user