diff --git a/deepdoc/parser/txt_parser.py b/deepdoc/parser/txt_parser.py index bb61005f4..8a322a128 100644 --- a/deepdoc/parser/txt_parser.py +++ b/deepdoc/parser/txt_parser.py @@ -15,7 +15,7 @@ from rag.nlp import find_codec,num_tokens_from_string import re class RAGFlowTxtParser: - def __call__(self, fnm, binary=None, chunk_token_num=128): + def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"): txt = "" if binary: encoding = find_codec(binary) @@ -27,7 +27,7 @@ class RAGFlowTxtParser: if not l: break txt += l - return self.parser_txt(txt, chunk_token_num) + return self.parser_txt(txt, chunk_token_num, delimiter) @classmethod def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):