Fix: handle UnicodeDecodeError when extracting images from docx. (#7636)

### What problem does this PR solve?

Close #7631

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu 2025-05-14 12:24:48 +08:00 committed by GitHub
parent e7a6a9e47e
commit bfe97d896d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 4 additions and 0 deletions

View File

@@ -60,6 +60,9 @@ class Docx(DocxParser):
except InvalidImageStreamError: except InvalidImageStreamError:
logging.info("The recognized image stream appears to be corrupted. Skipping image.") logging.info("The recognized image stream appears to be corrupted. Skipping image.")
return None return None
except UnicodeDecodeError:
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
return None
try: try:
image = Image.open(BytesIO(image_blob)).convert('RGB') image = Image.open(BytesIO(image_blob)).convert('RGB')
return image return image

View File

@@ -44,6 +44,7 @@ def chunks_format(reference):
"similarity": chunk.get("similarity"), "similarity": chunk.get("similarity"),
"vector_similarity": chunk.get("vector_similarity"), "vector_similarity": chunk.get("vector_similarity"),
"term_similarity": chunk.get("term_similarity"), "term_similarity": chunk.get("term_similarity"),
"doc_type": chunk.get("doc_type_kwd"),
} }
for chunk in reference.get("chunks", []) for chunk in reference.get("chunks", [])
] ]