Refine English synonym (#3371)

### What problem does this PR solve?

#3361

### Type of change

- [x] Performance Improvement
This commit is contained in:
Kevin Hu 2024-11-13 12:58:37 +08:00 committed by GitHub
parent 0c95a3382b
commit 91332fa0f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 73 additions and 5 deletions

View File

@ -446,9 +446,22 @@ class ComponentBase(ABC):
outs = []
for q in self._param.query:
if q["component_id"]:
if q["component_id"].split("@")[0].lower().find("begin") > 0:
cpn_id, key = q["component_id"].split("@")
for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
if p["key"] == key:
outs.append(pd.DataFrame([{"content": p["value"]}]))
self._param.inputs.append({"component_id": q["component_id"],
"content": p["value"]})
break
else:
assert False, f"Can't find parameter '{key}' for {cpn_id}"
continue
outs.append(self._canvas.get_component(q["component_id"])["obj"].output(allow_partial=False)[1])
self._param.inputs.append({"component_id": q["component_id"],
"content": "\n".join([str(d["content"]) for d in outs[-1].to_dict('records')])})
"content": "\n".join(
[str(d["content"]) for d in outs[-1].to_dict('records')])})
elif q["value"]:
self._param.inputs.append({"component_id": None, "content": q["value"]})
outs.append(pd.DataFrame([{"content": q["value"]}]))

View File

@ -104,6 +104,18 @@ class Generate(ComponentBase):
retrieval_res = []
self._param.inputs = []
for para in self._param.parameters:
if para["component_id"].split("@")[0].lower().find("begin") > 0:
cpn_id, key = para["component_id"].split("@")
for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
if p["key"] == key:
kwargs[para["key"]] = p["value"]
self._param.inputs.append(
{"component_id": para["component_id"], "content": kwargs[para["key"]]})
break
else:
assert False, f"Can't find parameter '{key}' for {cpn_id}"
continue
cpn = self._canvas.get_component(para["component_id"])["obj"]
if cpn.component_name.lower() == "answer":
kwargs[para["key"]] = self._canvas.get_history(1)[0]["content"]

View File

@ -25,6 +25,7 @@ from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import UserTenantService
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
@ -518,3 +519,32 @@ def upload_and_parse():
doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
return get_json_result(data=doc_ids)
@manager.route('/parse', methods=['POST'])
@login_required
def parse():
    """Extract plain text from a web page URL or from uploaded files.

    When the JSON body carries a non-empty ``url``, the page is loaded in a
    headless Chrome instance, its page source is passed to
    ``RAGFlowHtmlParser`` and the resulting sections are returned joined by
    newlines. Otherwise the uploaded ``file`` parts are handed to
    ``FileService.parse_docs`` and its result is returned.

    Returns:
        The ``get_json_result`` response: the extracted text on success, or
        ``data=False`` with ``RetCode.ARGUMENT_ERROR`` when the URL is invalid
        or the request has no ``file`` part.
    """
    # A file upload arrives as multipart/form-data rather than JSON;
    # silent=True keeps a missing/invalid JSON body from aborting the request
    # before the file branch below can run.
    payload = request.get_json(silent=True)
    url = payload.get("url") if isinstance(payload, dict) else None
    if url:
        if not is_valid_url(url):
            return get_json_result(
                data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
        # NOTE(review): the server fetches a caller-supplied URL; is_valid_url
        # is defined elsewhere -- confirm it rejects internal/private targets.
        # Imported lazily so selenium is only needed when a URL is parsed.
        from selenium.webdriver import Chrome, ChromeOptions

        options = ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = Chrome(options=options)
        try:
            driver.get(url)
            sections = RAGFlowHtmlParser()(driver.page_source)
        finally:
            # Always shut the browser down, even if loading or parsing fails;
            # otherwise every request leaves a Chrome process behind.
            driver.quit()
        return get_json_result(data="\n".join(sections))

    if 'file' not in request.files:
        return get_json_result(
            data=False, message='No file part!', code=RetCode.ARGUMENT_ERROR)

    file_objs = request.files.getlist('file')
    txt = FileService.parse_docs(file_objs, current_user.id)

    return get_json_result(data=txt)

View File

@ -75,11 +75,20 @@ class FulltextQueryer:
if not self.isChinese(txt):
txt = FulltextQueryer.rmWWW(txt)
tks = rag_tokenizer.tokenize(txt).split(" ")
tks_w = self.tw.weights(tks)
keywords = [t for t in tks if t]
tks_w = self.tw.weights(tks, preprocess=False)
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
q = ["{}^{:.4f}".format(tk, w) for tk, w in tks_w if tk]
syns = []
for tk, w in tks_w:
syn = self.syn.lookup(tk)
syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
keywords.extend(syn)
syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
syns.append(" ".join(syn))
q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
for i in range(1, len(tks_w)):
q.append(
'"%s %s"^%.4f'
@ -94,7 +103,7 @@ class FulltextQueryer:
query = " ".join(q)
return MatchTextExpr(
self.query_fields, query, 100
), tks
), keywords
def need_fine_grained_tokenize(tk):
if len(tk) < 3:

View File

@ -18,7 +18,7 @@ import json
import os
import time
import re
from nltk.corpus import wordnet
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
@ -67,6 +67,10 @@ class Dealer:
logger.error("Fail to load synonym!" + str(e))
def lookup(self, tk):
if re.match(r"[a-z]+$", tk):
res = list(set([re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets("love")]) - set([tk]))
return [t for t in res if t]
self.lookup_num += 1
self.load()
res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])