From bfe97d896d2eb063f32554d051da3803e2c4541c Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Wed, 14 May 2025 12:24:48 +0800 Subject: [PATCH] Fix: docx get image exception. (#7636) ### What problem does this PR solve? Close #7631 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/app/naive.py | 3 +++ rag/prompts.py | 1 + 2 files changed, 4 insertions(+) diff --git a/rag/app/naive.py b/rag/app/naive.py index e2e4a6a45..28e3bbbcc 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -60,6 +60,9 @@ class Docx(DocxParser): except InvalidImageStreamError: logging.info("The recognized image stream appears to be corrupted. Skipping image.") return None + except UnicodeDecodeError: + logging.info("The recognized image stream appears to be corrupted. Skipping image.") + return None try: image = Image.open(BytesIO(image_blob)).convert('RGB') return image diff --git a/rag/prompts.py b/rag/prompts.py index 4a61de557..cb1e1108b 100644 --- a/rag/prompts.py +++ b/rag/prompts.py @@ -44,6 +44,7 @@ def chunks_format(reference): "similarity": chunk.get("similarity"), "vector_similarity": chunk.get("vector_similarity"), "term_similarity": chunk.get("term_similarity"), + "doc_type": chunk.get("doc_type_kwd"), } for chunk in reference.get("chunks", []) ]