Fix GB2312 encoding issue (#394)

### What problem does this PR solve?

Issue link: #384

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
KevinHuSh 2024-04-16 19:45:14 +08:00 committed by GitHub
parent 044daff668
commit d4e0bfc8a5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 6 additions and 4 deletions

View File

@@ -14,8 +14,7 @@ from io import BytesIO
from docx import Document from docx import Document
import re import re
from deepdoc.parser.pdf_parser import PlainParser from deepdoc.parser.pdf_parser import PlainParser
from rag.app import laws from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
from deepdoc.parser import PdfParser, ExcelParser, DocxParser from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from rag.settings import cron_logger from rag.settings import cron_logger
@@ -140,7 +139,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
txt = "" txt = ""
if binary: if binary:
try:
txt = binary.decode("utf-8") txt = binary.decode("utf-8")
except Exception as e:
txt = binary.decode("gb2312")
else: else:
with open(filename, "r") as f: with open(filename, "r") as f:
while True: while True:

View File

@@ -237,7 +237,7 @@ class Dealer:
pieces_.append(t) pieces_.append(t)
es_logger.info("{} => {}".format(answer, pieces_)) es_logger.info("{} => {}".format(answer, pieces_))
if not pieces_: if not pieces_:
return answer return answer, set([])
ans_v, _ = embd_mdl.encode(pieces_) ans_v, _ = embd_mdl.encode(pieces_)
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format( assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(