Make language judgement more robust (#3287)

### What problem does this PR solve?



### Type of change

- [x] Performance Improvement
This commit is contained in:
Kevin Hu 2024-11-08 12:48:11 +08:00 committed by GitHub
parent a2153d61ce
commit d88f0d43ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -63,9 +63,9 @@ class EsQueryer:
rag_tokenizer.tradi2simp(
rag_tokenizer.strQ2B(
txt.lower()))).strip()
txt = EsQueryer.rmWWW(txt)
if not self.isChinese(txt):
txt = EsQueryer.rmWWW(txt)
tks = rag_tokenizer.tokenize(txt).split(" ")
tks_w = self.tw.weights(tks)
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
@@ -89,6 +89,7 @@ class EsQueryer:
return False
return True
txt = EsQueryer.rmWWW(txt)
qs, keywords = [], []
for tt in self.tw.split(txt)[:256]: # .split(" "):
if not tt: