Fix: Embedding err when docx contains unsupported images (#1720)

### What problem does this PR solve?

Fix the problem where embedding fails when a docx document
contains unsupported images.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Yuhao Tsui 2024-07-29 19:38:47 +08:00 committed by GitHub
parent 5e19423d82
commit a973b9e01f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -23,7 +23,7 @@ from rag.utils import num_tokens_from_string
from PIL import Image from PIL import Image
from functools import reduce from functools import reduce
from markdown import markdown from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError
class Docx(DocxParser): class Docx(DocxParser):
def __init__(self): def __init__(self):
@ -36,9 +36,16 @@ class Docx(DocxParser):
img = img[0] img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0] embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed] related_part = document.part.related_parts[embed]
image = related_part.image try:
image = Image.open(BytesIO(image.blob)).convert('RGB') image_blob = related_part.image.blob
return image except UnrecognizedImageError:
print("Unrecognized image format. Skipping image.")
return None
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
return image
except Exception as e:
return None
def __clean(self, line): def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip() line = re.sub(r"\u3000", " ", line).strip()