def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
    """Split *txt* into chunks of roughly ``chunk_token_num`` tokens.

    The text is first cut into segments, each ending right after a run of
    delimiter characters.  Segments are then appended to the current chunk
    until that chunk's running token count (as reported by
    ``num_tokens_from_string``) exceeds the budget; the next segment then
    opens a new chunk.  Because the budget is checked *before* a segment is
    added, a chunk may overshoot ``chunk_token_num``.

    Args:
        cls: Not used by the body.  NOTE(review): presumably this is a
            classmethod of ``RAGFlowTxtParser``; the decorator is outside the
            visible hunk -- confirm before splicing.
        txt: The text to split.  Must be a ``str``.
        chunk_token_num: Token budget per chunk.  Coerced with ``int()`` so
            numeric strings are accepted, as the previous implementation did.
        delimiter: A string whose individual characters each terminate a
            segment.  An empty string disables splitting.

    Returns:
        A list of ``[chunk_text, ""]`` pairs in document order.  The list
        always has at least one entry; for empty input it is ``[["", ""]]``.
        Joining every ``chunk_text`` reproduces ``txt`` exactly.

    Raises:
        TypeError: If ``txt`` is not a ``str``.
    """
    if not isinstance(txt, str):
        raise TypeError("txt type should be str!")

    token_budget = int(chunk_token_num)
    delimiter_chars = set(delimiter)  # O(1) membership inside the scan

    def iter_segments():
        """Yield consecutive, non-empty slices of txt ending after a delimiter run."""
        start = 0
        last_index = len(txt) - 1
        for index, char in enumerate(txt):
            if char not in delimiter_chars:
                continue
            # Keep runs such as "?!" or "\n\n" attached to the text before
            # them, so that no segment starts with leftover punctuation.
            if index < last_index and txt[index + 1] in delimiter_chars:
                continue
            yield txt[start:index + 1]
            start = index + 1
        # Trailing text without a closing delimiter.  Nothing is yielded when
        # the text already ended on a delimiter, so no empty segment (and
        # hence no empty trailing chunk) is ever produced.
        if start < len(txt):
            yield txt[start:]

    # Each chunk is kept as a list of pieces and joined once at the end,
    # which avoids repeatedly re-copying a growing string.
    chunk_pieces = [[]]
    current_tokens = 0
    for segment in iter_segments():
        segment_tokens = num_tokens_from_string(segment)
        if current_tokens > token_budget:
            chunk_pieces.append([segment])
            current_tokens = segment_tokens
        else:
            chunk_pieces[-1].append(segment)
            current_tokens += segment_tokens

    return [["".join(pieces), ""] for pieces in chunk_pieces]