Fix: Embedding error when docx contains unsupported images (#1720)

### What problem does this PR solve?

Fix the problem of embedding failing when a docx document
contains unsupported images.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Yuhao Tsui 2024-07-29 19:38:47 +08:00 committed by GitHub
parent 5e19423d82
commit a973b9e01f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -23,7 +23,7 @@ from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError
class Docx(DocxParser):
def __init__(self):
@ -36,9 +36,16 @@ class Docx(DocxParser):
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
image = related_part.image
image = Image.open(BytesIO(image.blob)).convert('RGB')
return image
try:
image_blob = related_part.image.blob
except UnrecognizedImageError:
print("Unrecognized image format. Skipping image.")
return None
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
return image
except Exception as e:
return None
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()