From d4e0bfc8a5a70611deb2db38037a8c66cbe1f5d3 Mon Sep 17 00:00:00 2001 From: KevinHuSh Date: Tue, 16 Apr 2024 19:45:14 +0800 Subject: [PATCH] fix gb2312 encoding issue (#394) ### What problem does this PR solve? Issue link: #384 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/app/naive.py | 8 +++++--- rag/nlp/search.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/rag/app/naive.py b/rag/app/naive.py index b97cf1573..608cf5611 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -14,8 +14,7 @@ from io import BytesIO from docx import Document import re from deepdoc.parser.pdf_parser import PlainParser -from rag.app import laws -from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks +from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks from deepdoc.parser import PdfParser, ExcelParser, DocxParser from rag.settings import cron_logger @@ -140,7 +139,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.1, "Start to parse.") txt = "" if binary: - txt = binary.decode("utf-8") + try: + txt = binary.decode("utf-8") + except Exception as e: + txt = binary.decode("gb2312") else: with open(filename, "r") as f: while True: diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 422ce54e9..971373cfc 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -237,7 +237,7 @@ class Dealer: pieces_.append(t) es_logger.info("{} => {}".format(answer, pieces_)) if not pieces_: - return answer + return answer, set([]) ans_v, _ = embd_mdl.encode(pieces_) assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(