Clean up the RAG word_extractor. (#19397)

Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
This commit is contained in:
湛露先生 2025-05-09 16:39:16 +08:00 committed by GitHub
parent 56cff485d0
commit 1119790b02
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -76,8 +76,7 @@ class WordExtractor(BaseExtractor):
parsed = urlparse(url) parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme) return bool(parsed.netloc) and bool(parsed.scheme)
def _extract_images_from_docx(self, doc, image_folder): def _extract_images_from_docx(self, doc):
os.makedirs(image_folder, exist_ok=True)
image_count = 0 image_count = 0
image_map = {} image_map = {}
@@ -210,7 +209,7 @@ class WordExtractor(BaseExtractor):
content = [] content = []
image_map = self._extract_images_from_docx(doc, image_folder) image_map = self._extract_images_from_docx(doc)
hyperlinks_url = None hyperlinks_url = None
url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+") url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
@@ -225,7 +224,7 @@ class WordExtractor(BaseExtractor):
xml = ElementTree.XML(run.element.xml) xml = ElementTree.XML(run.element.xml)
x_child = [c for c in xml.iter() if c is not None] x_child = [c for c in xml.iter() if c is not None]
for x in x_child: for x in x_child:
if x_child is None: if x is None:
continue continue
if x.tag.endswith("instrText"): if x.tag.endswith("instrText"):
if x.text is None: if x.text is None: