diff --git a/rag/app/naive.py b/rag/app/naive.py index ccb357e5e..a2ba6993c 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -23,7 +23,7 @@ from rag.utils import num_tokens_from_string from PIL import Image from functools import reduce from markdown import markdown - +from docx.image.exceptions import UnrecognizedImageError class Docx(DocxParser): def __init__(self): @@ -36,9 +36,16 @@ class Docx(DocxParser): img = img[0] embed = img.xpath('.//a:blip/@r:embed')[0] related_part = document.part.related_parts[embed] - image = related_part.image - image = Image.open(BytesIO(image.blob)).convert('RGB') - return image + try: + image_blob = related_part.image.blob + except UnrecognizedImageError: + print("Unrecognized image format. Skipping image.") + return None + try: + image = Image.open(BytesIO(image_blob)).convert('RGB') + return image + except Exception as e: + return None def __clean(self, line): line = re.sub(r"\u3000", " ", line).strip()