optimize text parser (#2144)

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement
This commit is contained in:
Kevin Hu 2024-08-28 18:11:19 +08:00 committed by GitHub
parent 54f7c6ea8e
commit a0b7c78dca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -33,14 +33,30 @@ class RAGFlowTxtParser:
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
if type(txt) != str:
raise TypeError("txt type should be str!")
sections = []
for sec in re.split(r"[%s]+"%delimiter, txt):
if sections and sec in delimiter:
sections[-1][0] += sec
continue
if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
sections.append([sec[: int(len(sec) / 2)], ""])
sections.append([sec[int(len(sec) / 2) :], ""])
cks = [""]
tk_nums = [0]
def add_chunk(t):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if tnum < 8:
pos = ""
if tk_nums[-1] > chunk_token_num:
cks.append(t)
tk_nums.append(tnum)
else:
sections.append([sec, ""])
return sections
cks[-1] += t
tk_nums[-1] += tnum
s, e = 0, 1
while e < len(txt):
if txt[e] in delimiter:
add_chunk(txt[s: e + 1])
s = e + 1
e = s + 1
else:
e += 1
if s < e:
add_chunk(txt[s: e + 1])
return [[c,""] for c in cks]