Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Kevin Hu, 2024-12-05 14:51:19 +08:00, committed by GitHub
Commit 56f473b680, parent b502dc7399
8 changed files with 55 additions and 24 deletions


@@ -68,6 +68,7 @@ def list_chunk():
                 "doc_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
                 "important_kwd": sres.field[id].get("important_kwd", []),
+                "question_kwd": sres.field[id].get("question_kwd", []),
                 "image_id": sres.field[id].get("img_id", ""),
                 "available_int": sres.field[id].get("available_int", 1),
                 "positions": json.loads(sres.field[id].get("position_list", "[]")),
@@ -115,7 +116,7 @@ def get():
 @manager.route('/set', methods=['POST'])
 @login_required
 @validate_request("doc_id", "chunk_id", "content_with_weight",
-                  "important_kwd")
+                  "important_kwd", "question_kwd")
 def set():
     req = request.json
     d = {
@@ -125,6 +126,8 @@ def set():
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req["important_kwd"]
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
+    d["question_kwd"] = req["question_kwd"]
+    d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
     if "available_int" in req:
         d["available_int"] = req["available_int"]
@@ -152,7 +155,7 @@ def set():
         d = beAdoc(d, arr[0], arr[1], not any(
             [rag_tokenizer.is_chinese(t) for t in q + a]))

-    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
@@ -213,6 +216,8 @@ def create():
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req.get("important_kwd", [])
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
+    d["question_kwd"] = req.get("question_kwd", [])
+    d["question_tks"] = rag_tokenizer.tokenize("\n".join(req.get("question_kwd", [])))
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
@@ -237,7 +242,7 @@ def create():
     embd_id = DocumentService.get_embd_id(req["doc_id"])
     embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING.value, embd_id)

-    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
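For reference, a minimal sketch of what a call to the updated `/set` handler could look like with the new field. Only the JSON keys are taken from the `@validate_request(...)` decorator above; the base URL, route prefix, and auth header are placeholder assumptions.

```python
import requests

# Placeholder host/prefix and token; only the JSON keys below come from the
# @validate_request("doc_id", "chunk_id", "content_with_weight",
#                   "important_kwd", "question_kwd") decorator in this diff.
BASE_URL = "http://localhost:9380/v1/chunk"            # assumption
HEADERS = {"Authorization": "Bearer <ACCESS_TOKEN>"}   # assumption

payload = {
    "doc_id": "<doc_id>",
    "chunk_id": "<chunk_id>",
    "content_with_weight": "The chunk text itself.",
    "important_kwd": ["keyword one", "keyword two"],
    # New in this PR: questions that should stand in for this chunk at retrieval time.
    "question_kwd": ["What does this chunk describe?"],
}

resp = requests.post(f"{BASE_URL}/set", json=payload, headers=HEADERS)
print(resp.json())
```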


@@ -844,6 +844,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
                 "doc_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
                 "important_kwd": sres.field[id].get("important_kwd", []),
+                "question_kwd": sres.field[id].get("question_kwd", []),
                 "img_id": sres.field[id].get("img_id", ""),
                 "available_int": sres.field[id].get("available_int", 1),
                 "positions": sres.field[id].get("position_int", "").split("\t"),
@@ -879,6 +880,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
             "content_with_weight": "content",
             "doc_id": "document_id",
             "important_kwd": "important_keywords",
+            "question_kwd": "questions",
             "img_id": "image_id",
             "available_int": "available",
         }
@@ -978,6 +980,11 @@ def add_chunk(tenant_id, dataset_id, document_id):
             return get_error_data_result(
                 "`important_keywords` is required to be a list"
             )
+    if "questions" in req:
+        if type(req["questions"]) != list:
+            return get_error_data_result(
+                "`questions` is required to be a list"
+            )
     md5 = hashlib.md5()
     md5.update((req["content"] + document_id).encode("utf-8"))
@@ -992,6 +999,10 @@ def add_chunk(tenant_id, dataset_id, document_id):
     d["important_tks"] = rag_tokenizer.tokenize(
         " ".join(req.get("important_keywords", []))
     )
+    d["question_kwd"] = req.get("questions", [])
+    d["question_tks"] = rag_tokenizer.tokenize(
+        "\n".join(req.get("questions", []))
+    )
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
     d["kb_id"] = dataset_id
@@ -1001,7 +1012,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
     embd_mdl = TenantLLMService.model_instance(
         tenant_id, LLMType.EMBEDDING.value, embd_id
     )
-    v, c = embd_mdl.encode([doc.name, req["content"]])
+    v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
@@ -1013,6 +1024,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "kb_id": "dataset_id",
         "create_timestamp_flt": "create_timestamp",
         "create_time": "create_time",
@@ -1166,8 +1178,13 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
     if "important_keywords" in req:
         if not isinstance(req["important_keywords"], list):
             return get_error_data_result("`important_keywords` should be a list")
-        d["important_kwd"] = req.get("important_keywords")
+        d["important_kwd"] = req.get("important_keywords", [])
         d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
+    if "questions" in req:
+        if not isinstance(req["questions"], list):
+            return get_error_data_result("`questions` should be a list")
+        d["question_kwd"] = req.get("questions")
+        d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["questions"]))
     if "available" in req:
         d["available_int"] = int(req["available"])
     embd_id = DocumentService.get_embd_id(document_id)
@@ -1185,7 +1202,7 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
             d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
         )

-    v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, d["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
@@ -1353,6 +1370,7 @@ def retrieval_test(tenant_id):
             "content_with_weight": "content",
             "doc_id": "document_id",
             "important_kwd": "important_keywords",
+            "question_kwd": "questions",
             "docnm_kwd": "document_keyword",
         }
         rename_chunk = {}
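Same idea on the dataset-scoped HTTP API: `add_chunk` and `update_chunk` now accept a `questions` list and reject anything that is not a list. A usage sketch; the path pattern is the one used by the Python SDK further down, while the base URL and auth header format are assumptions.

```python
import requests

BASE_URL = "http://localhost:9380/api/v1"   # assumption
API_KEY = "<YOUR_API_KEY>"                  # assumption
dataset_id, document_id = "<dataset_id>", "<document_id>"

resp = requests.post(
    f"{BASE_URL}/datasets/{dataset_id}/documents/{document_id}/chunks",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "content": "Paris is the capital of France.",
        "important_keywords": ["Paris", "France"],
        # Must be a list; otherwise the endpoint answers
        # "`questions` is required to be a list".
        "questions": ["What is the capital of France?"],
    },
)
print(resp.json())
```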


@@ -11,6 +11,8 @@
     "name_kwd": {"type": "varchar", "default": ""},
     "important_kwd": {"type": "varchar", "default": ""},
     "important_tks": {"type": "varchar", "default": ""},
+    "question_kwd": {"type": "varchar", "default": ""},
+    "question_tks": {"type": "varchar", "default": ""},
     "content_with_weight": {"type": "varchar", "default": ""},
     "content_ltks": {"type": "varchar", "default": ""},
     "content_sm_ltks": {"type": "varchar", "default": ""},


@@ -31,6 +31,7 @@ class FulltextQueryer:
             "title_sm_tks^5",
             "important_kwd^30",
             "important_tks^20",
+            "question_tks^20",
             "content_ltks^2",
             "content_sm_ltks",
         ]
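The `^N` suffixes are per-field boosts, so a query term matched in `question_tks` now weighs the same as one matched in `important_tks` and ten times more than a plain `content_ltks` hit. A toy illustration of that weighting, not the actual FulltextQueryer or search-engine scoring:

```python
# Toy illustration of per-field boosting; real full-text scoring (BM25 etc.)
# is more involved, but the relative weights mirror the list above.
FIELD_BOOSTS = {
    "title_sm_tks": 5,
    "important_kwd": 30,
    "important_tks": 20,
    "question_tks": 20,   # added in this PR
    "content_ltks": 2,
    "content_sm_ltks": 1,
}

def toy_score(query_terms, doc_fields):
    """Sum of (term hits in a field) * boost, per field."""
    score = 0
    for field, text in doc_fields.items():
        tokens = text.split()
        hits = sum(tokens.count(t) for t in query_terms)
        score += hits * FIELD_BOOSTS.get(field, 1)
    return score

doc = {"content_ltks": "capital france paris", "question_tks": "what capital france"}
print(toy_score(["capital", "france"], doc))   # 2*2 + 2*20 = 44: the question hits dominate
```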


@@ -74,7 +74,7 @@ class Dealer:
         offset, limit = pg * ps, (pg + 1) * ps
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
-                                 "doc_id", "position_list", "knowledge_graph_kwd",
+                                 "doc_id", "position_list", "knowledge_graph_kwd", "question_kwd", "question_tks",
                                  "available_int", "content_with_weight", "pagerank_fea"])

         kwds = set([])
@@ -251,8 +251,9 @@ class Dealer:
         for i in sres.ids:
             content_ltks = sres.field[i][cfield].split()
             title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
+            question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
-            tks = content_ltks + title_tks*2 + important_kwd*5
+            tks = content_ltks + title_tks*2 + important_kwd*5 + question_tks*6
             ins_tw.append(tks)

         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
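On the re-ranking side the bias is applied by plain list repetition: question tokens are counted six times when token overlap with the query is computed. A quick check of what that line does:

```python
content_ltks = ["paris", "capital"]
title_tks = ["geo"]
important_kwd = ["france"]
question_tks = ["what", "capital", "france"]

# Mirrors `tks = content_ltks + title_tks*2 + important_kwd*5 + question_tks*6`
tks = content_ltks + title_tks * 2 + important_kwd * 5 + question_tks * 6
print(len(tks))              # 27 tokens: 2 + 2 + 5 + 18
print(tks.count("capital"))  # 7: once from the content, six times from the questions
```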
@@ -322,11 +323,14 @@ class Dealer:
             sim = tsim = vsim = [1]*len(sres.ids)
             idx = list(range(len(sres.ids)))

+        def floor_sim(score):
+            return (int(score * 100.)%100)/100.
+
         dim = len(sres.query_vector)
         vector_column = f"q_{dim}_vec"
         zero_vector = [0.0] * dim
         for i in idx:
-            if sim[i] < similarity_threshold:
+            if floor_sim(sim[i]) < similarity_threshold:
                 break
             if len(ranks["chunks"]) >= page_size:
                 if aggs:
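The new `floor_sim` helper truncates the similarity to two decimals before the threshold comparison, and the `% 100` also discards any whole-number part. A few spot checks of the expression:

```python
def floor_sim(score):
    # Same expression as in the hunk above.
    return (int(score * 100.) % 100) / 100.

print(floor_sim(0.876))  # 0.87, truncated rather than rounded
print(floor_sim(0.2))    # 0.2
print(floor_sim(1.0))    # 0.0, the whole-number part is dropped by % 100
```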
@@ -337,8 +341,6 @@ class Dealer:
             dnm = chunk["docnm_kwd"]
             did = chunk["doc_id"]
             position_list = chunk.get("position_list", "[]")
-            if not position_list:
-                position_list = "[]"
             d = {
                 "chunk_id": id,
                 "content_ltks": chunk["content_ltks"],


@@ -255,13 +255,8 @@ def build_chunks(task, progress_callback):
         progress_callback(msg="Start to generate questions for every chunk ...")
         chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
         for d in docs:
-            qst = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"])
-            d["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + d["content_with_weight"]
-            qst = rag_tokenizer.tokenize(qst)
-            if "content_ltks" in d:
-                d["content_ltks"] += " " + qst
-            if "content_sm_ltks" in d:
-                d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+            d["question_kwd"] = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"]).split("\n")
+            d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
         progress_callback(msg="Question generation completed in {:.2f}s".format(timer() - st))

     return docs
@@ -275,9 +270,16 @@ def init_kb(row, vector_size: int):
 def embedding(docs, mdl, parser_config=None, callback=None):
     if parser_config is None:
         parser_config = {}
-    batch_size = 32
-    tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
-        re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
+    batch_size = 16
+    tts, cnts = [], []
+    for d in docs:
+        tts.append(rmSpace(d["title_tks"]))
+        c = "\n".join(d.get("question_kwd", []))
+        if not c:
+            c = d["content_with_weight"]
+        c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
+        cnts.append(c)
     tk_count = 0
     if len(tts) == len(cnts):
         tts_ = np.array([])
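The embedding input now prefers the attached questions: if a chunk carries `question_kwd`, those questions are embedded instead of the chunk body, which otherwise gets its table markup stripped as before. A compact sketch of just that selection step, reusing the regex from the hunk above:

```python
import re

def embedding_text(d):
    """Pick the text to embed for a chunk dict `d`, mirroring the loop above."""
    c = "\n".join(d.get("question_kwd", []))
    if not c:  # no questions attached -> fall back to the chunk content
        c = d["content_with_weight"]
    # Strip simple table markup, same regex as in the diff.
    return re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)

print(embedding_text({"content_with_weight": "<table><td>42</td></table>"}))  # table tags become spaces
print(embedding_text({"content_with_weight": "ignored",
                      "question_kwd": ["What is the answer?", "Why 42?"]}))   # questions win
```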


@@ -6,6 +6,7 @@ class Chunk(Base):
         self.id = ""
         self.content = ""
         self.important_keywords = []
+        self.questions = []
         self.create_time = ""
         self.create_timestamp = 0.0
         self.dataset_id = None


@@ -61,9 +61,9 @@ class Document(Base):
             return chunks
         raise Exception(res.get("message"))

-    def add_chunk(self, content: str,important_keywords: list[str] = []):
-        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks', {"content":content,"important_keywords":important_keywords})
+    def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = []):
+        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks',
+                        {"content":content,"important_keywords":important_keywords, "questions": questions})
         res = res.json()
         if res.get("code") == 0:
             return Chunk(self.rag,res["data"].get("chunk"))
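With the SDK change, questions can be attached at chunk creation time. A minimal usage sketch, assuming `doc` is a `ragflow_sdk` `Document` already fetched from a dataset; client setup and IDs are placeholders, and only the new `add_chunk` signature and the `Chunk.questions` attribute come from this PR.

```python
# `doc` is assumed to be an existing ragflow_sdk Document instance.
chunk = doc.add_chunk(
    content="Paris is the capital of France.",
    important_keywords=["Paris", "France"],
    questions=["What is the capital of France?"],  # new parameter
)
print(chunk.questions)  # populated via the new Chunk attribute added above
```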