From 56f473b680729c012de53f33ef59543d5be463d0 Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Thu, 5 Dec 2024 14:51:19 +0800
Subject: [PATCH] Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 api/apps/chunk_app.py                      | 11 +++++++---
 api/apps/sdk/doc.py                        | 24 +++++++++++++++++++---
 conf/infinity_mapping.json                 |  2 ++
 rag/nlp/query.py                           |  1 +
 rag/nlp/search.py                          | 12 ++++++-----
 rag/svr/task_executor.py                   | 22 +++++++++++---------
 sdk/python/ragflow_sdk/modules/chunk.py    |  1 +
 sdk/python/ragflow_sdk/modules/document.py |  6 +++---
 8 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index b6a978e31..9889615a4 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -68,6 +68,7 @@ def list_chunk():
             "doc_id": sres.field[id]["doc_id"],
             "docnm_kwd": sres.field[id]["docnm_kwd"],
             "important_kwd": sres.field[id].get("important_kwd", []),
+            "question_kwd": sres.field[id].get("question_kwd", []),
             "image_id": sres.field[id].get("img_id", ""),
             "available_int": sres.field[id].get("available_int", 1),
             "positions": json.loads(sres.field[id].get("position_list", "[]")),
@@ -115,7 +116,7 @@ def get():
 @manager.route('/set', methods=['POST'])
 @login_required
 @validate_request("doc_id", "chunk_id", "content_with_weight",
-                  "important_kwd")
+                  "important_kwd", "question_kwd")
 def set():
     req = request.json
     d = {
@@ -125,6 +126,8 @@ def set():
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req["important_kwd"]
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
+    d["question_kwd"] = req["question_kwd"]
+    d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
     if "available_int" in req:
         d["available_int"] = req["available_int"]
 
@@ -152,7 +155,7 @@ def set():
             d = beAdoc(d, arr[0], arr[1], not any(
                 [rag_tokenizer.is_chinese(t) for t in q + a]))
 
-        v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+        v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
         d["q_%d_vec" % len(v)] = v.tolist()
         settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
@@ -213,6 +216,8 @@ def create():
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req.get("important_kwd", [])
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
+    d["question_kwd"] = req.get("question_kwd", [])
+    d["question_tks"] = rag_tokenizer.tokenize("\n".join(req.get("question_kwd", [])))
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
 
@@ -237,7 +242,7 @@ def create():
     embd_id = DocumentService.get_embd_id(req["doc_id"])
     embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING.value, embd_id)
 
-    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
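
The two `encode(...)` changes above in `set()` and `create()` share one rule: when a chunk carries attached questions, the joined questions are embedded instead of the chunk body. A minimal sketch of that selection rule; the helper name is illustrative and does not exist in `chunk_app.py`:

```python
# Illustrative only: mirrors the inline conditional added to set() and create().
def pick_text_to_embed(content_with_weight: str, question_kwd: list[str]) -> str:
    # Embed the joined questions when present; otherwise fall back to the chunk content.
    return "\n".join(question_kwd) if question_kwd else content_with_weight

assert pick_text_to_embed("chunk body", []) == "chunk body"
assert pick_text_to_embed("chunk body", ["What is X?", "Why Y?"]) == "What is X?\nWhy Y?"
```
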
diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 0dc97ff7a..0132cd994 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -844,6 +844,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
             "doc_id": sres.field[id]["doc_id"],
             "docnm_kwd": sres.field[id]["docnm_kwd"],
             "important_kwd": sres.field[id].get("important_kwd", []),
+            "question_kwd": sres.field[id].get("question_kwd", []),
             "img_id": sres.field[id].get("img_id", ""),
             "available_int": sres.field[id].get("available_int", 1),
             "positions": sres.field[id].get("position_int", "").split("\t"),
@@ -879,6 +880,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
             "content_with_weight": "content",
             "doc_id": "document_id",
             "important_kwd": "important_keywords",
+            "question_kwd": "questions",
             "img_id": "image_id",
             "available_int": "available",
         }
@@ -978,6 +980,11 @@ def add_chunk(tenant_id, dataset_id, document_id):
             return get_error_data_result(
                 "`important_keywords` is required to be a list"
             )
+    if "questions" in req:
+        if type(req["questions"]) != list:
+            return get_error_data_result(
+                "`questions` is required to be a list"
+            )
     md5 = hashlib.md5()
     md5.update((req["content"] + document_id).encode("utf-8"))
 
@@ -992,6 +999,10 @@ def add_chunk(tenant_id, dataset_id, document_id):
     d["important_tks"] = rag_tokenizer.tokenize(
         " ".join(req.get("important_keywords", []))
     )
+    d["question_kwd"] = req.get("questions", [])
+    d["question_tks"] = rag_tokenizer.tokenize(
+        "\n".join(req.get("questions", []))
+    )
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
     d["kb_id"] = dataset_id
@@ -1001,7 +1012,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
     embd_mdl = TenantLLMService.model_instance(
         tenant_id, LLMType.EMBEDDING.value, embd_id
     )
-    v, c = embd_mdl.encode([doc.name, req["content"]])
+    v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
@@ -1013,6 +1024,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "kb_id": "dataset_id",
         "create_timestamp_flt": "create_timestamp",
         "create_time": "create_time",
@@ -1166,8 +1178,13 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
     if "important_keywords" in req:
         if not isinstance(req["important_keywords"], list):
             return get_error_data_result("`important_keywords` should be a list")
-        d["important_kwd"] = req.get("important_keywords")
+        d["important_kwd"] = req.get("important_keywords", [])
         d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
+    if "questions" in req:
+        if not isinstance(req["questions"], list):
+            return get_error_data_result("`questions` should be a list")
+        d["question_kwd"] = req.get("questions")
+        d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["questions"]))
     if "available" in req:
         d["available_int"] = int(req["available"])
     embd_id = DocumentService.get_embd_id(document_id)
@@ -1185,7 +1202,7 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
         d = beAdoc(
             d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
         )
-    v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, d["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
@@ -1353,6 +1370,7 @@ def retrieval_test(tenant_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "docnm_kwd": "document_keyword",
     }
     rename_chunk = {}
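
For the HTTP API above, `questions` is an optional list accepted alongside `important_keywords` when adding or updating a chunk, and it is rejected unless it is a list. A hedged request sketch: the host, route prefix, and API key are placeholders, only the JSON fields come from this diff:

```python
import requests

# Placeholders: adjust the host, route prefix, and token for your deployment.
BASE_URL = "http://localhost:9380/api/v1"
headers = {"Authorization": "Bearer <API_KEY>"}

resp = requests.post(
    f"{BASE_URL}/datasets/<dataset_id>/documents/<document_id>/chunks",
    headers=headers,
    json={
        "content": "Paris is the capital of France.",
        "important_keywords": ["Paris", "France"],
        "questions": ["What is the capital of France?"],  # new optional field
    },
)
print(resp.json())
```
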
diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json
index 743ba90d7..a9d1d4f0e 100644
--- a/conf/infinity_mapping.json
+++ b/conf/infinity_mapping.json
@@ -11,6 +11,8 @@
   "name_kwd": {"type": "varchar", "default": ""},
   "important_kwd": {"type": "varchar", "default": ""},
   "important_tks": {"type": "varchar", "default": ""},
+  "question_kwd": {"type": "varchar", "default": ""},
+  "question_tks": {"type": "varchar", "default": ""},
   "content_with_weight": {"type": "varchar", "default": ""},
   "content_ltks": {"type": "varchar", "default": ""},
   "content_sm_ltks": {"type": "varchar", "default": ""},
diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index 3243d2a80..6c018d2ca 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -31,6 +31,7 @@ class FulltextQueryer:
             "title_sm_tks^5",
             "important_kwd^30",
             "important_tks^20",
+            "question_tks^20",
             "content_ltks^2",
             "content_sm_ltks",
         ]
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index f09bbfbda..c8de4df05 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -74,7 +74,7 @@ class Dealer:
         offset, limit = pg * ps, (pg + 1) * ps
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
-                                 "doc_id", "position_list", "knowledge_graph_kwd",
+                                 "doc_id", "position_list", "knowledge_graph_kwd", "question_kwd", "question_tks",
                                  "available_int", "content_with_weight", "pagerank_fea"])
         kwds = set([])
 
@@ -251,8 +251,9 @@ class Dealer:
         for i in sres.ids:
             content_ltks = sres.field[i][cfield].split()
             title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
+            question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
-            tks = content_ltks + title_tks*2 + important_kwd*5
+            tks = content_ltks + title_tks*2 + important_kwd*5 + question_tks*6
             ins_tw.append(tks)
 
             sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
@@ -322,11 +323,14 @@
             sim = tsim = vsim = [1]*len(sres.ids)
 
         idx = list(range(len(sres.ids)))
+        def floor_sim(score):
+            return (int(score * 100.)%100)/100.
+
         dim = len(sres.query_vector)
         vector_column = f"q_{dim}_vec"
         zero_vector = [0.0] * dim
         for i in idx:
-            if sim[i] < similarity_threshold:
+            if floor_sim(sim[i]) < similarity_threshold:
                 break
             if len(ranks["chunks"]) >= page_size:
                 if aggs:
@@ -337,8 +341,6 @@
             dnm = chunk["docnm_kwd"]
             did = chunk["doc_id"]
             position_list = chunk.get("position_list", "[]")
-            if not position_list:
-                position_list = "[]"
             d = {
                 "chunk_id": id,
                 "content_ltks": chunk["content_ltks"],
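
Two retrieval-side effects of the changes above: question tokens join the keyword re-ranking bag with a 6x weight, and the new `floor_sim()` truncates the similarity to two decimals before the threshold check. A small sketch with made-up token lists and scores:

```python
# Token bag assembled on the re-ranking path (illustrative values; the list names
# mirror the fields read in rag/nlp/search.py).
content_ltks = ["paris", "capital", "france"]
title_tks = ["geography"]
important_kwd = ["paris"]
question_tks = ["what", "capital", "france"]
tks = content_ltks + title_tks * 2 + important_kwd * 5 + question_tks * 6

# floor_sim() as added in rag/nlp/search.py: 0.199 is compared as 0.19,
# so it still clears a similarity_threshold of 0.19.
def floor_sim(score):
    return (int(score * 100.) % 100) / 100.

assert floor_sim(0.199) == 0.19
```
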
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 05605f5dd..7ab4b7254 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -255,13 +255,8 @@ def build_chunks(task, progress_callback):
         progress_callback(msg="Start to generate questions for every chunk ...")
         chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
         for d in docs:
-            qst = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"])
-            d["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + d["content_with_weight"]
-            qst = rag_tokenizer.tokenize(qst)
-            if "content_ltks" in d:
-                d["content_ltks"] += " " + qst
-            if "content_sm_ltks" in d:
-                d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+            d["question_kwd"] = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"]).split("\n")
+            d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
         progress_callback(msg="Question generation completed in {:.2f}s".format(timer() - st))
 
     return docs
@@ -275,9 +270,16 @@ def init_kb(row, vector_size: int):
 def embedding(docs, mdl, parser_config=None, callback=None):
     if parser_config is None:
         parser_config = {}
-    batch_size = 32
-    tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
-        re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
+    batch_size = 16
+    tts, cnts = [], []
+    for d in docs:
+        tts.append(rmSpace(d["title_tks"]))
+        c = "\n".join(d.get("question_kwd", []))
+        if not c:
+            c = d["content_with_weight"]
+        c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
+        cnts.append(c)
+
     tk_count = 0
     if len(tts) == len(cnts):
         tts_ = np.array([])
diff --git a/sdk/python/ragflow_sdk/modules/chunk.py b/sdk/python/ragflow_sdk/modules/chunk.py
index 2ddd6ad25..960c87df6 100644
--- a/sdk/python/ragflow_sdk/modules/chunk.py
+++ b/sdk/python/ragflow_sdk/modules/chunk.py
@@ -6,6 +6,7 @@ class Chunk(Base):
         self.id = ""
         self.content = ""
         self.important_keywords = []
+        self.questions = []
         self.create_time = ""
         self.create_timestamp = 0.0
         self.dataset_id = None
diff --git a/sdk/python/ragflow_sdk/modules/document.py b/sdk/python/ragflow_sdk/modules/document.py
index 62728636e..743016b23 100644
--- a/sdk/python/ragflow_sdk/modules/document.py
+++ b/sdk/python/ragflow_sdk/modules/document.py
@@ -61,9 +61,9 @@ class Document(Base):
                     return chunks
         raise Exception(res.get("message"))
 
-
-    def add_chunk(self, content: str,important_keywords: list[str] = []):
-        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks', {"content":content,"important_keywords":important_keywords})
+    def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = []):
+        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks',
+                        {"content":content,"important_keywords":important_keywords, "questions": questions})
        res = res.json()
        if res.get("code") == 0:
            return Chunk(self.rag,res["data"].get("chunk"))
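
With the SDK change above, `Document.add_chunk()` accepts a `questions` list and `Chunk` exposes a matching `questions` field. A hedged end-to-end sketch, assuming the usual `ragflow_sdk` entry points (`RAGFlow`, `list_datasets`, `list_documents`); the server address, API key, dataset name, and document id are placeholders:

```python
from ragflow_sdk import RAGFlow

# Placeholders: point at your own server, key, and data.
rag = RAGFlow(api_key="<API_KEY>", base_url="http://localhost:9380")
dataset = rag.list_datasets(name="demo")[0]
doc = dataset.list_documents(id="<document_id>")[0]

chunk = doc.add_chunk(
    content="Paris is the capital of France.",
    important_keywords=["Paris", "France"],
    questions=["What is the capital of France?"],  # new optional parameter
)
print(chunk.questions)  # the new Chunk field, defaults to []
```
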