From 7eee193956b686c941cc80d2393ab04b0e65d87a Mon Sep 17 00:00:00 2001 From: KevinHuSh Date: Tue, 28 May 2024 11:13:02 +0800 Subject: [PATCH] fix #917 #915 (#946) ### What problem does this PR solve? #917 #915 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/pdf_parser.py | 2 +- rag/app/naive.py | 14 +++++++++++--- rag/nlp/rag_tokenizer.py | 2 +- requirements.txt | 1 + requirements_dev.txt | 1 + 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 5068eed62..4e43df7a3 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -392,7 +392,7 @@ class RAGFlowPdfParser: b["text"].strip()[-1] in ",;:'\",、‘“;:-", len(b["text"].strip()) > 1 and b["text"].strip( )[-2] in ",;:'\",‘“、;:", - b["text"].strip()[0] in "。;?!?”)),,、:", + b_["text"].strip()[0] in "。;?!?”)),,、:", ] # features for not concating feats = [ diff --git a/rag/app/naive.py b/rag/app/naive.py index f91734b27..f1e3e3bdb 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -19,6 +19,8 @@ from deepdoc.parser.pdf_parser import PlainParser from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec from deepdoc.parser import PdfParser, ExcelParser, DocxParser from rag.settings import cron_logger +from rag.utils import num_tokens_from_string + class Docx(DocxParser): def __init__(self): @@ -149,8 +151,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if not l: break txt += l - sections = txt.split("\n") - sections = [(l, "") for l in sections if l] + sections = [] + for sec in txt.split("\n"): + if num_tokens_from_string(sec) > 10 * parser_config.get("chunk_token_num", 128): + sections.append((sec[:int(len(sec)/2)], "")) + sections.append((sec[int(len(sec)/2):], "")) + else: + sections.append((sec, "")) + callback(0.8, "Finish parsing.") elif re.search(r"\.doc$", filename, re.IGNORECASE): @@ -163,7 +171,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, else: raise NotImplementedError( - "file type not supported yet(doc, docx, pdf, txt supported)") + "file type not supported yet(pdf, xlsx, doc, docx, txt supported)") st = timer() chunks = naive_merge( diff --git a/rag/nlp/rag_tokenizer.py b/rag/nlp/rag_tokenizer.py index abd9146a9..be5b724b9 100644 --- a/rag/nlp/rag_tokenizer.py +++ b/rag/nlp/rag_tokenizer.py @@ -24,7 +24,7 @@ class RagTokenizer: def loadDict_(self, fnm): print("[HUQIE]:Build trie", fnm, file=sys.stderr) try: - of = open(fnm, "r") + of = open(fnm, "r", encoding='utf-8') while True: line = of.readline() if not line: diff --git a/requirements.txt b/requirements.txt index f242a2ce3..c8c3cd25d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -136,3 +136,4 @@ BCEmbedding loguru==0.7.2 umap-learn fasttext==0.9.2 +volcengine diff --git a/requirements_dev.txt b/requirements_dev.txt index f29022bca..ae0ba7a34 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -124,3 +124,4 @@ ollama==0.1.8 redis==5.0.4 fasttext==0.9.2 umap-learn +volcengine