mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-06-04 11:24:00 +08:00
Fix: Embedding error when docx contains unsupported images (#1720)
### What problem does this PR solve? Fix the problem of embedding failing when a docx document contains unsupported images. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
parent
5e19423d82
commit
a973b9e01f
@ -23,7 +23,7 @@ from rag.utils import num_tokens_from_string
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
from markdown import markdown
|
from markdown import markdown
|
||||||
|
from docx.image.exceptions import UnrecognizedImageError
|
||||||
|
|
||||||
class Docx(DocxParser):
|
class Docx(DocxParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -36,9 +36,16 @@ class Docx(DocxParser):
|
|||||||
img = img[0]
|
img = img[0]
|
||||||
embed = img.xpath('.//a:blip/@r:embed')[0]
|
embed = img.xpath('.//a:blip/@r:embed')[0]
|
||||||
related_part = document.part.related_parts[embed]
|
related_part = document.part.related_parts[embed]
|
||||||
image = related_part.image
|
try:
|
||||||
image = Image.open(BytesIO(image.blob)).convert('RGB')
|
image_blob = related_part.image.blob
|
||||||
return image
|
except UnrecognizedImageError:
|
||||||
|
print("Unrecognized image format. Skipping image.")
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
||||||
|
return image
|
||||||
|
except Exception as e:
|
||||||
|
return None
|
||||||
|
|
||||||
def __clean(self, line):
|
def __clean(self, line):
|
||||||
line = re.sub(r"\u3000", " ", line).strip()
|
line = re.sub(r"\u3000", " ", line).strip()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user