Fix issues in API (#3008)

### What problem does this PR solve?

Fix issues in API

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
This commit is contained in:
liuhua 2024-10-24 20:10:47 +08:00 committed by GitHub
parent 161c7a231b
commit 648f8e81d1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 187 additions and 205 deletions

View File

@ -64,7 +64,12 @@ def create(tenant_id):
if not req.get("embedding_model"):
req['embedding_model'] = t.embd_id
else:
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
"BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
"nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
"text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
and req.get("embedding_model") not in valid_embedding_models:
return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
key_mapping = {
"chunk_num": "chunk_count",
@ -133,6 +138,9 @@ def update(tenant_id,dataset_id):
return get_error_data_result(
retmsg="Can't change `tenant_id`.")
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if "parser_config" in req:
print(kb.parser_config,flush=True)
req["parser_config"]=kb.parser_config.update(req["parser_config"])
if "chunk_count" in req:
if req["chunk_count"] != kb.chunk_num:
return get_error_data_result(
@ -153,10 +161,15 @@ def update(tenant_id,dataset_id):
if "embedding_model" in req:
if kb.chunk_num != 0 and req['embedding_model'] != kb.embd_id:
return get_error_data_result(
retmsg="If `chunk_count` is not 0, `embedding_method` is not changeable.")
retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable.")
if not req.get("embedding_model"):
return get_error_data_result("`embedding_model` can't be empty")
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
"BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
"nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
"text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
and req.get("embedding_model") not in valid_embedding_models:
return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
req['embd_id'] = req.pop('embedding_model')
if "name" in req:

View File

@ -163,9 +163,6 @@ def update_doc(tenant_id, dataset_id, document_id):
doc.process_duation * -1)
if not e:
return get_error_data_result(retmsg="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["id"])
if not tenant_id:
return get_error_data_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
@ -245,14 +242,22 @@ def delete(tenant_id,dataset_id):
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
req = request.json
if not req.get("ids"):
return get_error_data_result(retmsg="`ids` is required")
doc_ids = req["ids"]
if not req:
doc_ids=None
else:
doc_ids=req.get("ids")
if not doc_ids:
doc_list = []
docs=DocumentService.query(kb_id=dataset_id)
for doc in docs:
doc_list.append(doc.id)
else:
doc_list=doc_ids
root_folder = FileService.get_root_folder(tenant_id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, tenant_id)
errors = ""
for doc_id in doc_ids:
for doc_id in doc_list:
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
@ -290,8 +295,11 @@ def parse(tenant_id,dataset_id):
if not req.get("document_ids"):
return get_error_data_result("`document_ids` is required")
for id in req["document_ids"]:
if not DocumentService.query(id=id,kb_id=dataset_id):
doc = DocumentService.query(id=id,kb_id=dataset_id)
if not doc:
return get_error_data_result(retmsg=f"You don't own the document {id}.")
if doc[0].progress != 0.0:
return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
info = {"run": "1", "progress": 0}
info["progress_msg"] = ""
info["chunk_num"] = 0
@ -349,7 +357,27 @@ def list_chunks(tenant_id,dataset_id,document_id):
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
key_mapping = {
"chunk_num": "chunk_count",
"kb_id": "dataset_id",
"token_num": "token_count",
"parser_id": "chunk_method"
}
run_mapping = {
"0": "UNSTART",
"1": "RUNNING",
"2": "CANCEL",
"3": "DONE",
"4": "FAIL"
}
doc=doc.to_dict()
renamed_doc = {}
for key, value in doc.items():
if key == "run":
renamed_doc["run"] = run_mapping.get(str(value))
new_key = key_mapping.get(key, key)
renamed_doc[new_key] = value
res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
origin_chunks = []
sign = 0
for id in sres.ids:
@ -388,7 +416,7 @@ def list_chunks(tenant_id,dataset_id,document_id):
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
"img_id": "image_id",
"img_id": "image_id"
}
renamed_chunk = {}
for key, value in chunk.items():

View File

@ -31,7 +31,7 @@ Creates a dataset.
- `"language"`: `string`
- `"embedding_model"`: `string`
- `"permission"`: `string`
- `"parse_method"`: `string`
- `"chunk_method"`: `string`
- `"parser_config"`: `Dataset.ParserConfig`
#### Request example
@ -41,11 +41,9 @@ curl --request POST \
--url http://{address}/api/v1/dataset \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_API_KEY}' \
--data '
{
"name": "test",
"chunk_method": "naive"
}'
--data '{
"name": "test_1"
}'
```
#### Request parameters
@ -109,31 +107,32 @@ Success:
"data": {
"avatar": null,
"chunk_count": 0,
"create_date": "Thu, 10 Oct 2024 05:57:37 GMT",
"create_time": 1728539857641,
"chunk_method": "naive",
"create_date": "Thu, 24 Oct 2024 09:14:07 GMT",
"create_time": 1729761247434,
"created_by": "69736c5e723611efb51b0242ac120007",
"description": null,
"document_count": 0,
"embedding_model": "BAAI/bge-large-zh-v1.5",
"id": "8d73076886cc11ef8c270242ac120006",
"id": "527fa74891e811ef9c650242ac120006",
"language": "English",
"name": "test_1",
"parse_method": "naive",
"parser_config": {
"pages": [
[
1,
1000000
]
]
"chunk_token_num": 128,
"delimiter": "\\n!?;。;!?",
"html4excel": false,
"layout_recognize": true,
"raptor": {
"user_raptor": false
}
},
"permission": "me",
"similarity_threshold": 0.2,
"status": "1",
"tenant_id": "69736c5e723611efb51b0242ac120007",
"token_num": 0,
"update_date": "Thu, 10 Oct 2024 05:57:37 GMT",
"update_time": 1728539857641,
"update_date": "Thu, 24 Oct 2024 09:14:07 GMT",
"update_time": 1729761247434,
"vector_similarity_weight": 0.3
}
}
@ -229,9 +228,7 @@ curl --request PUT \
--header 'Authorization: Bearer {YOUR_API_KEY}' \
--data '
{
"name": "test",
"embedding_model": "BAAI/bge-zh-v1.5",
"chunk_method": "naive"
     "name": "updated_dataset"
}'
```
@ -336,7 +333,7 @@ Success:
"id": "6e211ee0723611efa10a0242ac120007",
"language": "English",
"name": "mysql",
"parse_method": "knowledge_graph",
"chunk_method": "knowledge_graph",
"parser_config": {
"chunk_token_num": 8192,
"delimiter": "\\n!?;。;!?",
@ -418,7 +415,30 @@ Success:
```json
{
"code": 0
"code": 0,
"data": [
{
"chunk_method": "naive",
"created_by": "69736c5e723611efb51b0242ac120007",
"dataset_id": "527fa74891e811ef9c650242ac120006",
"id": "b330ec2e91ec11efbc510242ac120004",
"location": "1.txt",
"name": "1.txt",
"parser_config": {
"chunk_token_num": 128,
"delimiter": "\\n!?;。;!?",
"html4excel": false,
"layout_recognize": true,
"raptor": {
"user_raptor": false
}
},
"run": "UNSTART",
"size": 17966,
"thumbnail": "",
"type": "doc"
}
]
}
```
@ -552,7 +572,7 @@ curl --request GET \
Success:
```text
test_2.
This is a test to verify the file download functionality.
```
Failure:
@ -953,40 +973,54 @@ Success:
{
"code": 0,
"data": {
"chunks": [],
"doc": {
"chunk_num": 0,
"create_date": "Sun, 29 Sep 2024 03:47:29 GMT",
"create_time": 1727581649216,
"created_by": "69736c5e723611efb51b0242ac120007",
"id": "8cb781ec7e1511ef98ac0242ac120006",
"kb_id": "c7ee74067a2c11efb21c0242ac120006",
"location": "sunny_tomorrow.txt",
"name": "sunny_tomorrow.txt",
"parser_config": {
"pages": [
[
1,
1000000
]
"chunks": [
{
"available_int": 1,
"content": "This is a test content.",
"docnm_kwd": "1.txt",
"document_id": "b330ec2e91ec11efbc510242ac120004",
"id": "b48c170e90f70af998485c1065490726",
"image_id": "",
"important_keywords": "",
"positions": [
""
]
}
],
"doc": {
"chunk_count": 1,
"chunk_method": "naive",
"create_date": "Thu, 24 Oct 2024 09:45:27 GMT",
"create_time": 1729763127646,
"created_by": "69736c5e723611efb51b0242ac120007",
"dataset_id": "527fa74891e811ef9c650242ac120006",
"id": "b330ec2e91ec11efbc510242ac120004",
"location": "1.txt",
"name": "1.txt",
"parser_config": {
"chunk_token_num": 128,
"delimiter": "\\n!?;。;!?",
"html4excel": false,
"layout_recognize": true,
"raptor": {
"user_raptor": false
}
},
"parser_id": "naive",
"process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT",
"process_duation": 1435.37,
"progress": 0.0370833,
"progress_msg": "\nTask has been received.",
"run": "1",
"size": 24,
"process_begin_at": "Thu, 24 Oct 2024 09:56:44 GMT",
"process_duation": 0.54213,
"progress": 0.0,
"progress_msg": "Task dispatched...",
"run": "2",
"size": 17966,
"source_type": "local",
"status": "1",
"thumbnail": null,
"token_num": 0,
"thumbnail": "",
"token_count": 8,
"type": "doc",
"update_date": "Tue, 15 Oct 2024 10:47:46 GMT",
"update_time": 1728989266371
"update_date": "Thu, 24 Oct 2024 11:03:15 GMT",
"update_time": 1729767795721
},
"total": 0
"total": 1
}
}
```
@ -1287,29 +1321,7 @@ curl --request POST \
--header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_API_KEY}' \
--data '{
"dataset_ids": [
{
"avatar": null,
"chunk_count": 0,
"description": null,
"document_count": 0,
"embedding_model": "",
"id": "0b2cbc8c877f11ef89070242ac120005",
"language": "English",
"name": "Test_assistant",
"parse_method": "naive",
"parser_config": {
"pages": [
[
1,
1000000
]
]
},
"permission": "me",
"tenant_id": "4fb0cd625f9311efba4a0242ac120006"
}
],
"dataset_ids": ["0b2cbc8c877f11ef89070242ac120005"],
"name":"new_chat_1"
}'
```
@ -1363,49 +1375,29 @@ Success:
"code": 0,
"data": {
"avatar": "",
"create_date": "Fri, 11 Oct 2024 03:23:24 GMT",
"create_time": 1728617004635,
"create_date": "Thu, 24 Oct 2024 11:18:29 GMT",
"create_time": 1729768709023,
"dataset_ids": [
"527fa74891e811ef9c650242ac120006"
],
"description": "A helpful Assistant",
"do_refer": "1",
"id": "2ca4b22e878011ef88fe0242ac120005",
"knowledgebases": [
{
"avatar": null,
"chunk_count": 0,
"description": null,
"document_count": 0,
"embedding_model": "",
"id": "0b2cbc8c877f11ef89070242ac120005",
"language": "English",
"name": "Test_assistant",
"parse_method": "naive",
"parser_config": {
"pages": [
[
1,
1000000
]
]
},
"permission": "me",
"tenant_id": "4fb0cd625f9311efba4a0242ac120006"
}
],
"id": "b1f2f15691f911ef81180242ac120003",
"language": "English",
"llm": {
"frequency_penalty": 0.7,
"max_tokens": 512,
"model_name": "deepseek-chat___OpenAI-API@OpenAI-API-Compatible",
"model_name": "qwen-plus@Tongyi-Qianwen",
"presence_penalty": 0.4,
"temperature": 0.1,
"top_p": 0.3
},
"name": "new_chat_1",
"name": "12234",
"prompt": {
"empty_response": "Sorry! 知识库中未找到相关内容!",
"empty_response": "Sorry! No relevant content was found in the knowledge base!",
"keywords_similarity_weight": 0.3,
"opener": "您好我是您的助手小樱长得可爱又善良can I help you?",
"prompt": "你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\n 以下是知识库:\n {knowledge}\n 以上是知识库。",
"opener": "Hi! I'm your assistant, what can I do for you?",
"prompt": "You are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\n Here is the knowledge base:\n {knowledge}\n The above is the knowledge base.",
"rerank_model": "",
"similarity_threshold": 0.2,
"top_n": 6,
@ -1420,8 +1412,8 @@ Success:
"status": "1",
"tenant_id": "69736c5e723611efb51b0242ac120007",
"top_k": 1024,
"update_date": "Fri, 11 Oct 2024 03:23:24 GMT",
"update_time": 1728617004635
"update_date": "Thu, 24 Oct 2024 11:18:29 GMT",
"update_time": 1729768709023
}
}
```
@ -1636,56 +1628,27 @@ Success:
"data": [
{
"avatar": "",
"create_date": "Fri, 11 Oct 2024 03:23:24 GMT",
"create_time": 1728617004635,
"create_date": "Fri, 18 Oct 2024 06:20:06 GMT",
"create_time": 1729232406637,
"description": "A helpful Assistant",
"do_refer": "1",
"id": "2ca4b22e878011ef88fe0242ac120005",
"knowledgebases": [
{
"avatar": "",
"chunk_num": 0,
"create_date": "Fri, 11 Oct 2024 03:15:18 GMT",
"create_time": 1728616518986,
"created_by": "69736c5e723611efb51b0242ac120007",
"description": "",
"doc_num": 0,
"embd_id": "BAAI/bge-large-zh-v1.5",
"id": "0b2cbc8c877f11ef89070242ac120005",
"language": "English",
"name": "test_delete_chat",
"parser_config": {
"chunk_token_count": 128,
"delimiter": "\n!?。;!?",
"layout_recognize": true,
"task_page_size": 12
},
"parser_id": "naive",
"permission": "me",
"similarity_threshold": 0.2,
"status": "1",
"tenant_id": "69736c5e723611efb51b0242ac120007",
"token_num": 0,
"update_date": "Fri, 11 Oct 2024 04:01:31 GMT",
"update_time": 1728619291228,
"vector_similarity_weight": 0.3
}
],
"id": "04d0d8e28d1911efa3630242ac120006",
"dataset_ids": ["527fa74891e811ef9c650242ac120006"],
"language": "English",
"llm": {
"frequency_penalty": 0.7,
"max_tokens": 512,
"model_name": "deepseek-chat___OpenAI-API@OpenAI-API-Compatible",
"model_name": "qwen-plus@Tongyi-Qianwen",
"presence_penalty": 0.4,
"temperature": 0.1,
"top_p": 0.3
},
"name": "Test",
"name": "13243",
"prompt": {
"empty_response": "Sorry! 知识库中未找到相关内容!",
"empty_response": "Sorry! No relevant content was found in the knowledge base!",
"keywords_similarity_weight": 0.3,
"opener": "您好我是您的助手小樱长得可爱又善良can I help you?",
"prompt": "你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\n 以下是知识库:\n {knowledge}\n 以上是知识库。",
"opener": "Hi! I'm your assistant, what can I do for you?",
"prompt": "You are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\n Here is the knowledge base:\n {knowledge}\n The above is the knowledge base.",
"rerank_model": "",
"similarity_threshold": 0.2,
"top_n": 6,
@ -1700,8 +1663,8 @@ Success:
"status": "1",
"tenant_id": "69736c5e723611efb51b0242ac120007",
"top_k": 1024,
"update_date": "Fri, 11 Oct 2024 03:47:58 GMT",
"update_time": 1728618478392
"update_date": "Fri, 18 Oct 2024 06:20:06 GMT",
"update_time": 1729232406638
}
]
}
@ -2035,78 +1998,55 @@ Success:
data: {
"code": 0,
"data": {
"answer": "您好!有什么具体的问题或者需要的帮助",
"answer": "I am an intelligent assistant designed to help you with your inquiries. I can provide",
"reference": {},
"audio_binary": null,
"id": "31153052-7bac-4741-a513-ed07d853f29e"
"id": "d8e5ebb6-6b52-4fd1-bd02-35b52ba3acaa",
"session_id": "e14344d08d1a11efb6210242ac120004"
}
}
data: {
"code": 0,
"data": {
"answer": "您好!有什么具体的问题或者需要的帮助可以告诉我吗?我在这里是为了帮助",
"answer": "I am an intelligent assistant designed to help you with your inquiries. I can provide information, answer questions, and assist with tasks based on the knowledge available to me",
"reference": {},
"audio_binary": null,
"id": "31153052-7bac-4741-a513-ed07d853f29e"
"id": "d8e5ebb6-6b52-4fd1-bd02-35b52ba3acaa",
"session_id": "e14344d08d1a11efb6210242ac120004"
}
}
data: {
"code": 0,
"data": {
"answer": "您好!有什么具体的问题或者需要的帮助可以告诉我吗?我在这里是为了帮助您的。如果您有任何疑问或是需要获取",
"answer": "I am an intelligent assistant designed to help you with your inquiries. I can provide information, answer questions, and assist with tasks based on the knowledge available to me. How can I assist you today?",
"reference": {},
"audio_binary": null,
"id": "31153052-7bac-4741-a513-ed07d853f29e"
"id": "d8e5ebb6-6b52-4fd1-bd02-35b52ba3acaa",
"session_id": "e14344d08d1a11efb6210242ac120004"
}
}
data: {
"code": 0,
"data": {
"answer": "您好!有什么具体的问题或者需要的帮助可以告诉我吗?我在这里是为了帮助您的。如果您有任何疑问或是需要获取某些信息,请随时提出。",
"reference": {},
"audio_binary": null,
"id": "31153052-7bac-4741-a513-ed07d853f29e"
}
}
data: {
"code": 0,
"data": {
"answer": "您好!有什么具体的问题或者需要的帮助可以告诉我吗 ##0$$?我在这里是为了帮助您的。如果您有任何疑问或是需要获取某些信息,请随时提出。",
"answer": "I am an intelligent assistant designed to help you with your inquiries. I can provide information, answer questions, and assist with tasks based on the knowledge available to me ##0$$. How can I assist you today?",
"reference": {
"total": 19,
"total": 8,
"chunks": [
{
"chunk_id": "9d87f9d70a0d8a7565694a81fd4c5d5f",
"content_ltks": "当所有知识库内容都与问题无关时 ,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\r\n以下是知识库:\r\n{knowledg}\r\n以上是知识库\r\n\"\"\"\r\n 1\r\n 2\r\n 3\r\n 4\r\n 5\r\n 6\r\n总结\r\n通过上面的介绍,可以对开源的 ragflow有了一个大致的了解,与前面的有道qanyth整体流程还是比较类似的。 ",
"content_with_weight": "当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\r\n 以下是知识库:\r\n {knowledge}\r\n 以上是知识库\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n总结\r\n通过上面的介绍可以对开源的 RagFlow 有了一个大致的了解,与前面的 有道 QAnything 整体流程还是比较类似的。",
"doc_id": "5c5999ec7be811ef9cab0242ac120005",
"docnm_kwd": "1.txt",
"kb_id": "c7ee74067a2c11efb21c0242ac120006",
"important_kwd": [],
"img_id": "",
"similarity": 0.38337178633282265,
"vector_similarity": 0.3321336754679629,
"term_similarity": 0.4053309767034769,
"positions": [
""
]
},
{
"chunk_id": "895d34de762e674b43e8613c6fb54c6d",
"content_ltks": "\r\n\r\n实际内容可能会超过大模型的输入token数量,因此在调用大模型前会调用api/db/servic/dialog_service.py文件中 messag_fit_in ()根据大模型可用的 token数量进行过滤。这部分与有道的 qanyth的实现大同小异,就不额外展开了。\r\n\r\n将检索的内容,历史聊天记录以及问题构造为 prompt ,即可作为大模型的输入了 ,默认的英文prompt如下所示:\r\n\r\n\"\"\"\r\nyou are an intellig assistant. pleas summar the content of the knowledg base to answer the question. pleas list thedata in the knowledg base and answer in detail. when all knowledg base content is irrelev to the question , your answer must includ the sentenc\"the answer you are lookfor isnot found in the knowledg base!\" answer needto consid chat history.\r\n here is the knowledg base:\r\n{ knowledg}\r\nthe abov is the knowledg base.\r\n\"\"\"\r\n1\r\n 2\r\n 3\r\n 4\r\n 5\r\n 6\r\n对应的中文prompt如下所示:\r\n\r\n\"\"\"\r\n你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。 ",
"content_with_weight": "\r\n\r\n实际内容可能会超过大模型的输入 token 数量,因此在调用大模型前会调用 api/db/services/dialog_service.py 文件中 message_fit_in() 根据大模型可用的 token 数量进行过滤。这部分与有道的 QAnything 的实现大同小异,就不额外展开了。\r\n\r\n将检索的内容历史聊天记录以及问题构造为 prompt即可作为大模型的输入了默认的英文 prompt 如下所示:\r\n\r\n\"\"\"\r\nYou are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\r\n Here is the knowledge base:\r\n {knowledge}\r\n The above is the knowledge base.\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n对应的中文 prompt 如下所示:\r\n\r\n\"\"\"\r\n你是一个智能助手请总结知识库的内容来回答问题请列举知识库中的数据详细回答。",
"content_ltks": "xxxx\r\n\r\n\"\"\"\r\nyou are an intellig assistant. pleas summar the content of the knowledg base to answer the question. pleas list thedata in the knowledg base and answer in detail. when all knowledg base content is irrelev to the question , your answer must includ the sentenc\"the answer you are lookfor isnot found in the knowledg base!\" answer needto consid chat history.\r\n here is the knowledg base:\r\n{ knowledg}\r\nthe abov is the knowledg base.\r\n\"\"\"\r\n1\r\n 2\r\n 3\r\n 4\r\n 5\r\n 6\r\nxxxx ",
"content_with_weight": "xxxx\r\n\r\n\"\"\"\r\nYou are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\r\n Here is the knowledge base:\r\n {knowledge}\r\n The above is the knowledge base.\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\nxxxx\r\n\r\n\"\"\"\r\nxxxx",
"doc_id": "5c5999ec7be811ef9cab0242ac120005",
"docnm_kwd": "1.txt",
"kb_id": "c7ee74067a2c11efb21c0242ac120006",
"important_kwd": [],
"img_id": "",
"similarity": 0.2788204323926715,
"vector_similarity": 0.35489427679953667,
"term_similarity": 0.2462173562183008,
"similarity": 0.4442746624416507,
"vector_similarity": 0.3843936320913369,
"term_similarity": 0.4699379611632138,
"positions": [
""
]
@ -2116,12 +2056,13 @@ data: {
{
"doc_name": "1.txt",
"doc_id": "5c5999ec7be811ef9cab0242ac120005",
"count": 2
"count": 1
}
]
},
"prompt": "你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\n 以下是知识库:\n 当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\r\n 以下是知识库:\r\n {knowledge}\r\n 以上是知识库\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n总结\r\n通过上面的介绍可以对开源的 RagFlow 有了一个大致的了解,与前面的 有道 QAnything 整体流程还是比较类似的。\n\n------\n\n\r\n\r\n实际内容可能会超过大模型的输入 token 数量,因此在调用大模型前会调用 api/db/services/dialog_service.py 文件中 message_fit_in() 根据大模型可用的 token 数量进行过滤。这部分与有道的 QAnything 的实现大同小异,就不额外展开了。\r\n\r\n将检索的内容历史聊天记录以及问题构造为 prompt即可作为大模型的输入了默认的英文 prompt 如下所示:\r\n\r\n\"\"\"\r\nYou are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\r\n Here is the knowledge base:\r\n {knowledge}\r\n The above is the knowledge base.\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n对应的中文 prompt 如下所示:\r\n\r\n\"\"\"\r\n你是一个智能助手请总结知识库的内容来回答问题请列举知识库中的数据详细回答。\n 以上是知识库。\n\n### Query:\n你好请问有什么问题需要我帮忙解答吗\n\n### Elapsed\n - Retrieval: 9131.1 ms\n - LLM: 12802.6 ms",
"id": "31153052-7bac-4741-a513-ed07d853f29e"
"prompt": "xxxx\r\n\r\n\"\"\"\r\nYou are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\r\n Here is the knowledge base:\r\n {knowledge}\r\n The above is the knowledge base.\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\nxxxx\n\n### Query:\nwho are you,please answer me in English\n\n### Elapsed\n - Retrieval: 332.2 ms\n - LLM: 2972.1 ms",
"id": "d8e5ebb6-6b52-4fd1-bd02-35b52ba3acaa",
"session_id": "e14344d08d1a11efb6210242ac120004"
}
}

View File

@ -337,7 +337,7 @@ def valid(permission,valid_permission,language,valid_language,chunk_method,valid
def valid_parameter(parameter,valid_values):
if parameter and parameter not in valid_values:
return get_error_data_result(f"{parameter} is not in {valid_values}")
return get_error_data_result(f"`{parameter}` is not in {valid_values}")
def get_parser_config(chunk_method,parser_config):
if parser_config:

View File

@ -70,7 +70,7 @@ class Document(Base):
return Chunk(self.rag,res["data"].get("chunk"))
raise Exception(res.get("message"))
def delete_chunks(self,ids:List[str]):
def delete_chunks(self,ids:List[str] = None):
res = self.rm(f"dataset/{self.dataset_id}/document/{self.id}/chunk",{"ids":ids})
res = res.json()
if res.get("code")!=0: