diff --git a/agent/component/base.py b/agent/component/base.py
index 2029e7a4d..dbab23c57 100644
--- a/agent/component/base.py
+++ b/agent/component/base.py
@@ -446,9 +446,22 @@ class ComponentBase(ABC):
         outs = []
         for q in self._param.query:
             if q["component_id"]:
+                if q["component_id"].split("@")[0].lower().find("begin") >= 0:
+                    cpn_id, key = q["component_id"].split("@")
+                    for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
+                        if p["key"] == key:
+                            outs.append(pd.DataFrame([{"content": p["value"]}]))
+                            self._param.inputs.append({"component_id": q["component_id"],
+                                                       "content": p["value"]})
+                            break
+                    else:
+                        assert False, f"Can't find parameter '{key}' for {cpn_id}"
+                    continue
+
                 outs.append(self._canvas.get_component(q["component_id"])["obj"].output(allow_partial=False)[1])
                 self._param.inputs.append({"component_id": q["component_id"],
-                                           "content": "\n".join([str(d["content"]) for d in outs[-1].to_dict('records')])})
+                                           "content": "\n".join(
+                                               [str(d["content"]) for d in outs[-1].to_dict('records')])})
             elif q["value"]:
                 self._param.inputs.append({"component_id": None, "content": q["value"]})
                 outs.append(pd.DataFrame([{"content": q["value"]}]))
diff --git a/agent/component/generate.py b/agent/component/generate.py
index ab5b07ed7..65ce5472e 100644
--- a/agent/component/generate.py
+++ b/agent/component/generate.py
@@ -104,6 +104,18 @@ class Generate(ComponentBase):
         retrieval_res = []
         self._param.inputs = []
         for para in self._param.parameters:
+            if para["component_id"].split("@")[0].lower().find("begin") >= 0:
+                cpn_id, key = para["component_id"].split("@")
+                for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
+                    if p["key"] == key:
+                        kwargs[para["key"]] = p["value"]
+                        self._param.inputs.append(
+                            {"component_id": para["component_id"], "content": kwargs[para["key"]]})
+                        break
+                else:
+                    assert False, f"Can't find parameter '{key}' for {cpn_id}"
+                continue
+
             cpn = self._canvas.get_component(para["component_id"])["obj"]
             if cpn.component_name.lower() == "answer":
                 kwargs[para["key"]] = self._canvas.get_history(1)[0]["content"]
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 19f5e20b4..a43ec5783 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -24,7 +24,9 @@
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.task_service import TaskService, queue_tasks
 from api.db.services.user_service import UserTenantService
+from api.utils.web_utils import is_valid_url
+from deepdoc.parser.html_parser import RAGFlowHtmlParser
 from rag.nlp import search
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -518,3 +520,33 @@ def upload_and_parse():
     doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
 
     return get_json_result(data=doc_ids)
+
+
+@manager.route('/parse', methods=['POST'])
+@login_required
+def parse():
+    url = request.json.get("url") if request.is_json else ""
+    if url:
+        if not is_valid_url(url):
+            return get_json_result(
+                data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
+        from selenium.webdriver import Chrome, ChromeOptions
+        options = ChromeOptions()
+        options.add_argument('--headless')
+        options.add_argument('--disable-gpu')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+        driver = Chrome(options=options)
+        driver.get(url)
+        sections = RAGFlowHtmlParser()(driver.page_source)
+        driver.quit()
+        return get_json_result(data="\n".join(sections))
+
+    if 'file' not in request.files:
+        return get_json_result(
+            data=False, message='No file part!', code=RetCode.ARGUMENT_ERROR)
+
+    file_objs = request.files.getlist('file')
+    txt = FileService.parse_docs(file_objs, current_user.id)
+
+    return get_json_result(data=txt)
diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index e5b55401f..390b514e8 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -75,11 +75,20 @@ class FulltextQueryer:
         if not self.isChinese(txt):
             txt = FulltextQueryer.rmWWW(txt)
             tks = rag_tokenizer.tokenize(txt).split(" ")
-            tks_w = self.tw.weights(tks)
+            keywords = [t for t in tks if t]
+            tks_w = self.tw.weights(tks, preprocess=False)
             tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
             tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
             tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
-            q = ["{}^{:.4f}".format(tk, w) for tk, w in tks_w if tk]
+            syns = []
+            for tk, w in tks_w:
+                syn = self.syn.lookup(tk)
+                syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
+                keywords.extend(syn)
+                syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
+                syns.append(" ".join(syn))
+
+            q = ["({}^{:.4f}".format(tk, w) + " {})".format(syn) for (tk, w), syn in zip(tks_w, syns)]
             for i in range(1, len(tks_w)):
                 q.append(
                     '"%s %s"^%.4f'
@@ -94,7 +103,7 @@
             query = " ".join(q)
             return MatchTextExpr(
                 self.query_fields, query, 100
-            ), tks
+            ), keywords
 
     def need_fine_grained_tokenize(tk):
         if len(tk) < 3:
diff --git a/rag/nlp/synonym.py b/rag/nlp/synonym.py
index 5b0f4fad0..1575b5344 100644
--- a/rag/nlp/synonym.py
+++ b/rag/nlp/synonym.py
@@ -18,7 +18,7 @@
 import json
 import os
 import time
 import re
-
+from nltk.corpus import wordnet
 from api.utils.file_utils import get_project_base_directory
 from api.utils.log_utils import logger
@@ -67,6 +67,10 @@ class Dealer:
             logger.error("Fail to load synonym!" + str(e))
 
     def lookup(self, tk):
+        if re.match(r"[a-z]+$", tk):
+            res = list(set([re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets(tk)]) - set([tk]))
+            return [t for t in res if t]
+
         self.lookup_num += 1
         self.load()
         res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])
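
A note on the `begin@<key>` branch added to `agent/component/base.py` and `agent/component/generate.py`: instead of reading another component's output, a reference such as `begin@lang` is split on `@` and resolved against the Begin component's declared `query` parameters, with the `for ... else` failing loudly when the key is missing. Below is a minimal standalone sketch of that resolution; the plain `canvas` dict is a stand-in for RAGFlow's canvas/component objects, not the real API:

```python
# Minimal sketch of the "begin@<key>" resolution; `canvas` is a simplified
# stand-in for RAGFlow's canvas/component objects.

def resolve_reference(canvas: dict, component_id: str) -> str:
    """Resolve 'begin@<key>' against Begin's parameters, else read a component output."""
    if component_id.split("@")[0].lower().find("begin") >= 0:
        cpn_id, key = component_id.split("@")
        for p in canvas[cpn_id]["query"]:        # Begin's user-facing parameters
            if p["key"] == key:
                return p["value"]
        raise KeyError(f"Can't find parameter '{key}' for {cpn_id}")
    return canvas[component_id]["output"]        # ordinary component reference

canvas = {
    "begin": {"query": [{"key": "lang", "value": "English"}]},
    "retrieval:0": {"output": "top-3 chunks ..."},
}
assert resolve_reference(canvas, "begin@lang") == "English"
assert resolve_reference(canvas, "retrieval:0") == "top-3 chunks ..."
```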
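To smoke-test the new `/parse` route: it accepts either a JSON body with a `url` (rendered by headless Chrome, then stripped to text by `RAGFlowHtmlParser`) or a multipart `file` upload (handled by `FileService.parse_docs`). The base URL and authentication below are deployment-specific assumptions, not part of this patch:

```python
# Hypothetical client for the new endpoint; BASE and the auth header are
# assumptions -- adjust to your RAGFlow deployment.
import requests

BASE = "http://localhost:9380/v1/document"           # assumed mount point
headers = {"Authorization": "<your session token>"}  # route is @login_required

# URL mode: server-side headless-Chrome render, returns extracted text.
r = requests.post(f"{BASE}/parse", json={"url": "https://example.com"}, headers=headers)
print(r.json()["data"][:200])

# File mode: multipart upload parsed by FileService.parse_docs.
with open("sample.pdf", "rb") as f:
    r = requests.post(f"{BASE}/parse", files={"file": f}, headers=headers)
print(r.json()["data"][:200])
```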
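The `rag/nlp/query.py` change groups each weighted term with its quoted synonyms, boosted at a quarter of the term's weight, into a single parenthesized clause. A toy reproduction of just the string-building step; the tokens, weights, and synonym lists are invented for illustration:

```python
# Toy reproduction of the boosted-clause construction in query.py; the tokens,
# weights, and synonyms below are made up for illustration.
tks_w = [("car", 0.8), ("repair", 0.2)]
syns_per_token = [["auto", "vehicle"], ["fix"]]

syns = []
for (tk, w), syn in zip(tks_w, syns_per_token):
    # Each synonym becomes a quoted phrase at a quarter of the term's weight.
    syns.append(" ".join('"{}"^{:.4f}'.format(s, w / 4.) for s in syn))

q = ["({}^{:.4f}".format(tk, w) + " {})".format(syn)
     for (tk, w), syn in zip(tks_w, syns)]
print(" ".join(q))
# (car^0.8000 "auto"^0.2000 "vehicle"^0.2000) (repair^0.2000 "fix"^0.0500)
```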
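`rag/nlp/synonym.py` now answers purely alphabetic lowercase tokens from WordNet instead of the bundled synonym dictionary, which requires the NLTK `wordnet` corpus to be present (`python -m nltk.downloader wordnet`). A quick standalone check of what the new branch returns:

```python
# Mirrors the new WordNet branch in Dealer.lookup: take the head lemma of each
# synset, replace underscores with spaces, and drop the query token itself.
import re
from nltk.corpus import wordnet  # requires: python -m nltk.downloader wordnet

def wn_lookup(tk: str) -> list:
    res = {re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets(tk)}
    return [t for t in res - {tk} if t]

print(wn_lookup("love"))  # e.g. ['beloved', 'sexual love', 'sleep together']
```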