mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-06-04 11:24:00 +08:00
optimize text parser (#2144)
### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
54f7c6ea8e
commit
a0b7c78dca
@ -33,14 +33,30 @@ class RAGFlowTxtParser:
|
|||||||
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
|
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
|
||||||
if type(txt) != str:
|
if type(txt) != str:
|
||||||
raise TypeError("txt type should be str!")
|
raise TypeError("txt type should be str!")
|
||||||
sections = []
|
cks = [""]
|
||||||
for sec in re.split(r"[%s]+"%delimiter, txt):
|
tk_nums = [0]
|
||||||
if sections and sec in delimiter:
|
|
||||||
sections[-1][0] += sec
|
def add_chunk(t):
|
||||||
continue
|
nonlocal cks, tk_nums, delimiter
|
||||||
if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
|
tnum = num_tokens_from_string(t)
|
||||||
sections.append([sec[: int(len(sec) / 2)], ""])
|
if tnum < 8:
|
||||||
sections.append([sec[int(len(sec) / 2) :], ""])
|
pos = ""
|
||||||
|
if tk_nums[-1] > chunk_token_num:
|
||||||
|
cks.append(t)
|
||||||
|
tk_nums.append(tnum)
|
||||||
else:
|
else:
|
||||||
sections.append([sec, ""])
|
cks[-1] += t
|
||||||
return sections
|
tk_nums[-1] += tnum
|
||||||
|
|
||||||
|
s, e = 0, 1
|
||||||
|
while e < len(txt):
|
||||||
|
if txt[e] in delimiter:
|
||||||
|
add_chunk(txt[s: e + 1])
|
||||||
|
s = e + 1
|
||||||
|
e = s + 1
|
||||||
|
else:
|
||||||
|
e += 1
|
||||||
|
if s < e:
|
||||||
|
add_chunk(txt[s: e + 1])
|
||||||
|
|
||||||
|
return [[c,""] for c in cks]
|
Loading…
x
Reference in New Issue
Block a user