mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-06-04 11:24:00 +08:00
optimize text parser (#2144)
### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
54f7c6ea8e
commit
a0b7c78dca
@@ -33,14 +33,30 @@ class RAGFlowTxtParser:
|
||||
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
    """Cut ``txt`` at delimiter characters and pack the pieces into chunks.

    The text is scanned left to right. Each time a character contained in
    ``delimiter`` is found, the piece ending at (and including) that
    character is appended to the current chunk. A new chunk is opened only
    once the current chunk's accumulated token count already exceeds
    ``chunk_token_num``, so a chunk may overshoot the budget by the size of
    its last piece. Token counts are whatever ``num_tokens_from_string``
    reports for each piece.

    Args:
        cls: Not used by the body. Presumably the owning class when this is
            invoked as a classmethod (decorator not visible here; confirm).
        txt: The text to split.
        chunk_token_num: Soft per-chunk token budget; coerced with ``int``.
        delimiter: String whose individual characters act as cut points.

    Returns:
        A list of ``[chunk_text, ""]`` pairs in document order. The second
        element is always the empty string. Empty input yields ``[["", ""]]``.

    Raises:
        TypeError: If ``txt`` is not a ``str``.
    """
    if not isinstance(txt, str):
        raise TypeError("txt type should be str!")

    # Coerce once so the comparison below also works for numeric strings
    # or floats coming from configuration.
    chunk_token_num = int(chunk_token_num)

    cks = [""]      # chunk texts; the last one is still being filled
    tk_nums = [0]   # running token count of each chunk, parallel to cks

    def add_chunk(t):
        """Append piece ``t`` to the open chunk, or start a new chunk."""
        tnum = num_tokens_from_string(t)
        if tk_nums[-1] > chunk_token_num:
            # The open chunk is already over budget: close it and start anew.
            cks.append(t)
            tk_nums.append(tnum)
        else:
            cks[-1] += t
            tk_nums[-1] += tnum

    # s marks the start of the current piece, e is the scan cursor.
    # NOTE: the character at index 0 and the character immediately after a
    # delimiter are never tested, so a delimiter in those positions stays
    # glued to the following piece.
    s, e = 0, 1
    end = len(txt)
    while e < end:
        if txt[e] in delimiter:
            add_chunk(txt[s:e + 1])
            s = e + 1
            e = s + 1
        else:
            e += 1

    # Flush the trailing piece only when there is one. Text that ends on a
    # delimiter leaves nothing behind and must not produce an empty chunk.
    if s < end:
        add_chunk(txt[s:])

    return [[c, ""] for c in cks]
|
Loading…
x
Reference in New Issue
Block a user