mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 02:29:03 +08:00
accelerate tokenize (#3244)
### What problem does this PR solve? ### Type of change - [x] Performance Improvement
This commit is contained in:
parent
d3bb5e9f3d
commit
fbcc0bb408
@ -281,34 +281,49 @@ class RagTokenizer:
|
|||||||
print("[FW]", tks, s)
|
print("[FW]", tks, s)
|
||||||
print("[BW]", tks1, s1)
|
print("[BW]", tks1, s1)
|
||||||
|
|
||||||
diff = [0 for _ in range(max(len(tks1), len(tks)))]
|
i, j, _i, _j = 0, 0, 0, 0
|
||||||
for i in range(min(len(tks1), len(tks))):
|
same = 0
|
||||||
if tks[i] != tks1[i]:
|
while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
|
||||||
diff[i] = 1
|
same += 1
|
||||||
|
if same > 0: res.append(" ".join(tks[j: j + same]))
|
||||||
|
_i = i + same
|
||||||
|
_j = j + same
|
||||||
|
j = _j + 1
|
||||||
|
i = _i + 1
|
||||||
|
|
||||||
if s1 > s:
|
while i < len(tks1) and j < len(tks):
|
||||||
tks = tks1
|
tk1, tk = "".join(tks1[_i:i]), "".join(tks[_j:j])
|
||||||
|
if tk1 != tk:
|
||||||
i = 0
|
if len(tk1) > len(tk):
|
||||||
while i < len(tks):
|
j += 1
|
||||||
s = i
|
else:
|
||||||
while s < len(tks) and diff[s] == 0:
|
i += 1
|
||||||
s += 1
|
continue
|
||||||
if s == len(tks):
|
|
||||||
res.append(" ".join(tks[i:]))
|
|
||||||
break
|
|
||||||
if s > i:
|
|
||||||
res.append(" ".join(tks[i:s]))
|
|
||||||
|
|
||||||
e = s
|
|
||||||
while e < len(tks) and e - s < 5 and diff[e] == 1:
|
|
||||||
e += 1
|
|
||||||
|
|
||||||
|
if tks1[i] != tks[j]:
|
||||||
|
i += 1
|
||||||
|
j += 1
|
||||||
|
continue
|
||||||
|
# backward tokens from_i to i are different from forward tokens from _j to j.
|
||||||
tkslist = []
|
tkslist = []
|
||||||
self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
|
self.dfs_("".join(tks[_j:j]), 0, [], tkslist)
|
||||||
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
||||||
|
|
||||||
i = e + 1
|
same = 1
|
||||||
|
while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
|
||||||
|
same += 1
|
||||||
|
res.append(" ".join(tks[j: j + same]))
|
||||||
|
_i = i + same
|
||||||
|
_j = j + same
|
||||||
|
j = _j + 1
|
||||||
|
i = _i + 1
|
||||||
|
|
||||||
|
if _i < len(tks1):
|
||||||
|
assert _j < len(tks)
|
||||||
|
assert "".join(tks1[_i:]) == "".join(tks[_j:])
|
||||||
|
tkslist = []
|
||||||
|
self.dfs_("".join(tks[_j:]), 0, [], tkslist)
|
||||||
|
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
||||||
|
|
||||||
res = " ".join(self.english_normalize_(res))
|
res = " ".join(self.english_normalize_(res))
|
||||||
if self.DEBUG:
|
if self.DEBUG:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user