From dab92ac1e8df8cefac2836589582ed625a9cb53a Mon Sep 17 00:00:00 2001 From: liuhua <10215101452@stu.ecnu.edu.cn> Date: Wed, 16 Oct 2024 18:41:24 +0800 Subject: [PATCH] Refactor Chunk API (#2855) ### What problem does this PR solve? Refactor Chunk API #2846 ### Type of change - [x] Refactoring --------- Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn> Co-authored-by: Kevin Hu --- api/apps/sdk/doc.py | 167 ++++---- api/apps/sdk/session.py | 45 +- api/db/services/document_service.py | 3 +- api/http_api.md | 570 ++++++++++++++++--------- api/python_api_reference.md | 389 ++++++++--------- sdk/python/ragflow/modules/chunk.py | 31 +- sdk/python/ragflow/modules/dataset.py | 11 + sdk/python/ragflow/modules/document.py | 177 ++------ sdk/python/ragflow/modules/session.py | 5 +- sdk/python/ragflow/ragflow.py | 111 +---- sdk/python/test/t_document.py | 42 +- 11 files changed, 760 insertions(+), 791 deletions(-) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 502697e14..840994b38 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -119,13 +119,11 @@ def update_doc(tenant_id, dataset_id, document_id): if informs: e, file = FileService.get_by_id(informs[0].file_id) FileService.update_by_id(file.id, {"name": req["name"]}) + if "parser_config" in req: + DocumentService.update_parser_config(doc.id, req["parser_config"]) if "parser_method" in req: if doc.parser_id.lower() == req["parser_method"].lower(): - if "parser_config" in req: - if req["parser_config"] == doc.parser_config: - return get_result(retcode=RetCode.SUCCESS) - else: - return get_result(retcode=RetCode.SUCCESS) + return get_result() if doc.type == FileType.VISUAL or re.search( r"\.(ppt|pptx|pages)$", doc.name): @@ -146,8 +144,6 @@ def update_doc(tenant_id, dataset_id, document_id): return get_error_data_result(retmsg="Tenant not found!") ELASTICSEARCH.deleteByQuery( Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)) - if "parser_config" in req: - DocumentService.update_parser_config(doc.id, req["parser_config"]) return get_result() @@ -258,6 +254,8 @@ def parse(tenant_id,dataset_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") req = request.json + if not req.get("document_ids"): + return get_error_data_result("`document_ids` is required") for id in req["document_ids"]: if not DocumentService.query(id=id,kb_id=dataset_id): return get_error_data_result(retmsg=f"You don't own the document {id}.") @@ -283,9 +281,14 @@ def stop_parsing(tenant_id,dataset_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") req = request.json + if not req.get("document_ids"): + return get_error_data_result("`document_ids` is required") for id in req["document_ids"]: - if not DocumentService.query(id=id,kb_id=dataset_id): + doc = DocumentService.query(id=id, kb_id=dataset_id) + if not doc: return get_error_data_result(retmsg=f"You don't own the document {id}.") + if doc[0].progress == 100.0 or doc[0].progress == 0.0: + return get_error_data_result("Can't stop parsing document with progress at 0 or 100") info = {"run": "2", "progress": 0} DocumentService.update_by_id(id, info) # if str(req["run"]) == TaskStatus.CANCEL.value: @@ -297,7 +300,7 @@ def stop_parsing(tenant_id,dataset_id): @manager.route('/dataset//document//chunk', methods=['GET']) @token_required -def list_chunk(tenant_id,dataset_id,document_id): +def 
list_chunks(tenant_id,dataset_id,document_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") doc=DocumentService.query(id=document_id, kb_id=dataset_id) @@ -309,57 +312,58 @@ def list_chunk(tenant_id,dataset_id,document_id): page = int(req.get("offset", 1)) size = int(req.get("limit", 30)) question = req.get("keywords", "") - try: - query = { - "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True + query = { + "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True + } + sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) + res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()} + origin_chunks = [] + sign = 0 + for id in sres.ids: + d = { + "chunk_id": id, + "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[ + id].get( + "content_with_weight", ""), + "doc_id": sres.field[id]["doc_id"], + "docnm_kwd": sres.field[id]["docnm_kwd"], + "important_kwd": sres.field[id].get("important_kwd", []), + "img_id": sres.field[id].get("img_id", ""), + "available_int": sres.field[id].get("available_int", 1), + "positions": sres.field[id].get("position_int", "").split("\t") } - if "available_int" in req: - query["available_int"] = int(req["available_int"]) - sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) - res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()} + if len(d["positions"]) % 5 == 0: + poss = [] + for i in range(0, len(d["positions"]), 5): + poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]), + float(d["positions"][i + 3]), float(d["positions"][i + 4])]) + d["positions"] = poss - origin_chunks = [] - for id in sres.ids: - d = { - "chunk_id": id, - "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[ - id].get( - "content_with_weight", ""), - "doc_id": sres.field[id]["doc_id"], - "docnm_kwd": sres.field[id]["docnm_kwd"], - "important_kwd": sres.field[id].get("important_kwd", []), - "img_id": sres.field[id].get("img_id", ""), - "available_int": sres.field[id].get("available_int", 1), - "positions": sres.field[id].get("position_int", "").split("\t") - } - if len(d["positions"]) % 5 == 0: - poss = [] - for i in range(0, len(d["positions"]), 5): - poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]), - float(d["positions"][i + 3]), float(d["positions"][i + 4])]) - d["positions"] = poss + origin_chunks.append(d) + if req.get("id"): + if req.get("id") == id: + origin_chunks.clear() + origin_chunks.append(d) + sign = 1 + break + if req.get("id"): + if sign == 0: + return get_error_data_result(f"Can't find this chunk {req.get('id')}") + for chunk in origin_chunks: + key_mapping = { + "chunk_id": "id", + "content_with_weight": "content", + "doc_id": "document_id", + "important_kwd": "important_keywords", + "img_id": "image_id", + } + renamed_chunk = {} + for key, value in chunk.items(): + new_key = key_mapping.get(key, key) + renamed_chunk[new_key] = value + res["chunks"].append(renamed_chunk) + return get_result(data=res) - origin_chunks.append(d) - ##rename keys - for chunk in origin_chunks: - key_mapping = { - "chunk_id": "id", - "content_with_weight": "content", - "doc_id": "document_id", - "important_kwd": "important_keywords", - "img_id": "image_id", - } - renamed_chunk 
= {} - for key, value in chunk.items(): - new_key = key_mapping.get(key, key) - renamed_chunk[new_key] = value - res["chunks"].append(renamed_chunk) - return get_result(data=res) - except Exception as e: - if str(e).find("not_found") > 0: - return get_result(retmsg=f'No chunk found!', - retcode=RetCode.DATA_ERROR) - return server_error_response(e) @manager.route('/dataset//document//chunk', methods=['POST']) @@ -374,6 +378,9 @@ def create(tenant_id,dataset_id,document_id): req = request.json if not req.get("content"): return get_error_data_result(retmsg="`content` is required") + if "important_keywords" in req: + if type(req["important_keywords"]) != list: + return get_error_data_result("`important_keywords` is required to be a list") md5 = hashlib.md5() md5.update((req["content"] + document_id).encode("utf-8")) @@ -381,8 +388,8 @@ def create(tenant_id,dataset_id,document_id): d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]), "content_with_weight": req["content"]} d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - d["important_kwd"] = req.get("important_kwd", []) - d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) + d["important_kwd"] = req.get("important_keywords", []) + d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", []))) d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] d["create_timestamp_flt"] = datetime.datetime.now().timestamp() d["kb_id"] = [doc.kb_id] @@ -432,12 +439,12 @@ def rm_chunk(tenant_id,dataset_id,document_id): req = request.json if not req.get("chunk_ids"): return get_error_data_result("`chunk_ids` is required") + query = { + "doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True} + sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) for chunk_id in req.get("chunk_ids"): - res = ELASTICSEARCH.get( - chunk_id, search.index_name( - tenant_id)) - if not res.get("found"): - return server_error_response(f"Chunk {chunk_id} not found") + if chunk_id not in sres.ids: + return get_error_data_result(f"Chunk {chunk_id} not found") if not ELASTICSEARCH.deleteByQuery( Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)): return get_error_data_result(retmsg="Index updating failure") @@ -451,24 +458,36 @@ def rm_chunk(tenant_id,dataset_id,document_id): @manager.route('/dataset//document//chunk/', methods=['PUT']) @token_required def set(tenant_id,dataset_id,document_id,chunk_id): - res = ELASTICSEARCH.get( + try: + res = ELASTICSEARCH.get( chunk_id, search.index_name( tenant_id)) - if not res.get("found"): - return get_error_data_result(f"Chunk {chunk_id} not found") + except Exception as e: + return get_error_data_result(f"Can't find this chunk {chunk_id}") if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") doc = DocumentService.query(id=document_id, kb_id=dataset_id) if not doc: return get_error_data_result(retmsg=f"You don't own the document {document_id}.") + doc = doc[0] + query = { + "doc_ids": [document_id], "page": 1, "size": 1024, "question": "", "sort": True + } + sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) + if chunk_id not in sres.ids: + return get_error_data_result(f"You don't own the chunk {chunk_id}") req = request.json + content=res["_source"].get("content_with_weight") d = { "id": chunk_id, - "content_with_weight": 
req.get("content",res.get["content_with_weight"])} - d["content_ltks"] = rag_tokenizer.tokenize(req["content"]) + "content_with_weight": req.get("content",content)} + d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"]) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - d["important_kwd"] = req.get("important_keywords",[]) - d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"])) + if "important_keywords" in req: + if type(req["important_keywords"]) != list: + return get_error_data_result("`important_keywords` is required to be a list") + d["important_kwd"] = req.get("important_keywords") + d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"])) if "available" in req: d["available_int"] = req["available"] embd_id = DocumentService.get_embd_id(document_id) @@ -478,7 +497,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id): arr = [ t for t in re.split( r"[\n\t]", - req["content"]) if len(t) > 1] + d["content_with_weight"]) if len(t) > 1] if len(arr) != 2: return get_error_data_result( retmsg="Q&A must be separated by TAB/ENTER key.") @@ -486,7 +505,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id): d = beAdoc(d, arr[0], arr[1], not any( [rag_tokenizer.is_chinese(t) for t in q + a])) - v, c = embd_mdl.encode([doc.name, req["content"]]) + v, c = embd_mdl.encode([doc.name, d["content_with_weight"]]) v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] d["q_%d_vec" % len(v)] = v.tolist() ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) @@ -505,7 +524,7 @@ def retrieval_test(tenant_id): for id in kb_id: if not KnowledgebaseService.query(id=id,tenant_id=tenant_id): return get_error_data_result(f"You don't own the dataset {id}.") - if "question" not in req_json: + if "question" not in req: return get_error_data_result("`question` is required.") page = int(req.get("offset", 1)) size = int(req.get("limit", 30)) diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py index 81bc84aca..897fdf2da 100644 --- a/api/apps/sdk/session.py +++ b/api/apps/sdk/session.py @@ -24,10 +24,9 @@ from api.utils import get_uuid from api.utils.api_utils import get_error_data_result from api.utils.api_utils import get_result, token_required - @manager.route('/chat//session', methods=['POST']) @token_required -def create(tenant_id, chat_id): +def create(tenant_id,chat_id): req = request.json req["dialog_id"] = chat_id dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value) @@ -51,14 +50,13 @@ def create(tenant_id, chat_id): del conv["reference"] return get_result(data=conv) - @manager.route('/chat//session/', methods=['PUT']) @token_required -def update(tenant_id, chat_id, session_id): +def update(tenant_id,chat_id,session_id): req = request.json req["dialog_id"] = chat_id conv_id = session_id - conv = ConversationService.query(id=conv_id, dialog_id=chat_id) + conv = ConversationService.query(id=conv_id,dialog_id=chat_id) if not conv: return get_error_data_result(retmsg="Session does not exist") if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value): @@ -74,16 +72,30 @@ def update(tenant_id, chat_id, session_id): return get_result() -@manager.route('/chat//session//completion', methods=['POST']) +@manager.route('/chat//completion', methods=['POST']) @token_required -def completion(tenant_id, chat_id, session_id): +def completion(tenant_id,chat_id): req = request.json # req = {"conversation_id": 
"9aaaca4c11d311efa461fa163e197198", "messages": [ # {"role": "user", "content": "上海有吗?"} # ]} + if not req.get("session_id"): + conv = { + "id": get_uuid(), + "dialog_id": chat_id, + "name": req.get("name", "New session"), + "message": [{"role": "assistant", "content": "Hi! I am your assistant,can I help you?"}] + } + if not conv.get("name"): + return get_error_data_result(retmsg="Name can not be empty.") + ConversationService.save(**conv) + e, conv = ConversationService.get_by_id(conv["id"]) + session_id=conv.id + else: + session_id = req.get("session_id") if not req.get("question"): return get_error_data_result(retmsg="Please input your question.") - conv = ConversationService.query(id=session_id, dialog_id=chat_id) + conv = ConversationService.query(id=session_id,dialog_id=chat_id) if not conv: return get_error_data_result(retmsg="Session does not exist") conv = conv[0] @@ -117,17 +129,18 @@ def completion(tenant_id, chat_id, session_id): conv.message[-1] = {"role": "assistant", "content": ans["answer"], "id": message_id, "prompt": ans.get("prompt", "")} ans["id"] = message_id + ans["session_id"]=session_id def stream(): nonlocal dia, msg, req, conv try: for ans in chat(dia, msg, **req): fillin_conv(ans) - yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n" + yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n" ConversationService.update_by_id(conv.id, conv.to_dict()) except Exception as e: yield "data:" + json.dumps({"code": 500, "message": str(e), - "data": {"answer": "**ERROR**: " + str(e), "reference": []}}, + "data": {"answer": "**ERROR**: " + str(e),"reference": []}}, ensure_ascii=False) + "\n\n" yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n" @@ -148,15 +161,14 @@ def completion(tenant_id, chat_id, session_id): break return get_result(data=answer) - @manager.route('/chat//session', methods=['GET']) @token_required -def list(chat_id, tenant_id): +def list(chat_id,tenant_id): if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value): return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.") id = request.args.get("id") name = request.args.get("name") - session = ConversationService.query(id=id, name=name, dialog_id=chat_id) + session = ConversationService.query(id=id,name=name,dialog_id=chat_id) if not session: return get_error_data_result(retmsg="The session doesn't exist") page_number = int(request.args.get("page", 1)) @@ -166,7 +178,7 @@ def list(chat_id, tenant_id): desc = False else: desc = True - convs = ConversationService.get_list(chat_id, page_number, items_per_page, orderby, desc, id, name) + convs = ConversationService.get_list(chat_id,page_number,items_per_page,orderby,desc,id,name) if not convs: return get_result(data=[]) for conv in convs: @@ -201,17 +213,16 @@ def list(chat_id, tenant_id): del conv["reference"] return get_result(data=convs) - @manager.route('/chat//session', methods=["DELETE"]) @token_required -def delete(tenant_id, chat_id): +def delete(tenant_id,chat_id): if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value): return get_error_data_result(retmsg="You don't own the chat") ids = request.json.get("ids") if not ids: return get_error_data_result(retmsg="`ids` is required in deleting operation") for id in ids: - conv = ConversationService.query(id=id, dialog_id=chat_id) + conv = ConversationService.query(id=id,dialog_id=chat_id) if not conv: return 
get_error_data_result(retmsg="The chat doesn't own the session") ConversationService.delete_by_id(id) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index f839dc185..40fd1188e 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -61,14 +61,13 @@ class DocumentService(CommonService): docs = docs.where( fn.LOWER(cls.model.name).contains(keywords.lower()) ) - count = docs.count() if desc: docs = docs.order_by(cls.model.getter_by(orderby).desc()) else: docs = docs.order_by(cls.model.getter_by(orderby).asc()) docs = docs.paginate(page_number, items_per_page) - + count = docs.count() return list(docs.dicts()), count diff --git a/api/http_api.md b/api/http_api.md index 87431c5df..99c8363e6 100644 --- a/api/http_api.md +++ b/api/http_api.md @@ -432,18 +432,71 @@ The error response includes a JSON object like the following: } ``` +## Delete files from a dataset + +**DELETE** `/api/v1/dataset/{dataset_id}/document ` + +Delete files from a dataset + +### Request + +- Method: DELETE +- URL: `http://{address}/api/v1/dataset/{dataset_id}/document` +- Headers: + - 'Content-Type: application/json' + - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' +- Body: + - `ids`:List[str] +#### Request example + +```bash +curl --request DELETE \ + --url http://{address}/api/v1/dataset/{dataset_id}/document \ + --header 'Content-Type: application/json' \ + --header 'Authorization: {YOUR ACCESS TOKEN}' \ + --data '{ + "ids": ["id_1","id_2"] + }' +``` + +#### Request parameters + +- `"ids"`: (*Body parameter*) + The ids of teh documents to be deleted +### Response + +The successful response includes a JSON object like the following: + +```json +{ + "code": 0 +}. +``` + +- `"error_code"`: `integer` + `0`: The operation succeeds. + + +The error response includes a JSON object like the following: + +```json +{ + "code": 102, + "message": "You do not own the dataset 7898da028a0511efbf750242ac1220005." +} +``` + ## Download a file from a dataset **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}` -Downloads files from a dataset. +Downloads a file from a dataset. ### Request - Method: GET -- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}` +- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}` - Headers: - - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - Output: - '{FILE_NAME}' @@ -451,10 +504,9 @@ Downloads files from a dataset. ```bash curl --request GET \ - --url http://{address}/api/v1/dataset/{dataset_id}/document/{documents_id} \ - --header 'Content-Type: application/json' \ - --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - --output '{FILE_NAME}' + --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \ + --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ + --output ./ragflow.txt ``` #### Request parameters @@ -466,7 +518,7 @@ curl --request GET \ ### Response -The successful response includes a JSON object like the following: +The successful response includes a text object like the following: ```text test_2. 
@@ -596,92 +648,39 @@ Update a file in a dataset - Headers: - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - +- Body: + - `name`:`string` + - `parser_method`:`string` + - `parser_config`:`dict` #### Request example ```bash curl --request PUT \ - --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \ + --url http://{address}/api/v1/dataset/{dataset_id}/info/{document_id} \ --header 'Authorization: Bearer {YOUR_ACCESS TOKEN}' \ --header 'Content-Type: application/json' \ --data '{ "name": "manual.txt", - "thumbnail": null, - "knowledgebase_id": "779333c0758611ef910f0242ac120004", "parser_method": "manual", - "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12}, - "source_type": "local", "type": "doc", - "created_by": "134408906b6811efbcd20242ac120005", - "size": 0, "token_count": 0, "chunk_count": 0, - "progress": 0.0, - "progress_msg": "", - "process_begin_at": null, - "process_duration": 0.0 + "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12} }' ``` #### Request parameters -- `"thumbnail"`: (*Body parameter*) - Thumbnail image of the document. - - `""` - -- `"knowledgebase_id"`: (*Body parameter*) - Knowledge base ID related to the document. - - `""` - - `"parser_method"`: (*Body parameter*) Method used to parse the document. - - `""` + - `"parser_config"`: (*Body parameter*) Configuration object for the parser. - If the value is `None`, a dictionary with default values will be generated. -- `"source_type"`: (*Body parameter*) - Source type of the document. - - `""` - -- `"type"`: (*Body parameter*) - Type or category of the document. - - `""` - -- `"created_by"`: (*Body parameter*) - Creator of the document. - - `""` - - `"name"`: (*Body parameter*) Name or title of the document. - - `""` -- `"size"`: (*Body parameter*) - Size of the document in bytes or some other unit. - - `0` -- `"token_count"`: (*Body parameter*) - Number of tokens in the document. - - `0` - -- `"chunk_count"`: (*Body parameter*) - Number of chunks the document is split into. - - `0` - -- `"progress"`: (*Body parameter*) - Current processing progress as a percentage. - - `0.0` - -- `"progress_msg"`: (*Body parameter*) - Message indicating current progress status. - - `""` - -- `"process_begin_at"`: (*Body parameter*) - Start time of the document processing. - - `None` - -- `"process_duration"`: (*Body parameter*) - Duration of the processing in seconds or minutes. 
- - `0.0` ### Response @@ -712,34 +711,34 @@ Parse files into chunks in a dataset ### Request - Method: POST -- URL: `/api/v1/dataset/{dataset_id}/chunk` +- URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk ` - Headers: - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' +- Body: + - `document_ids`:List[str] #### Request example -```shell +```bash curl --request POST \ - --url http://{address}/api/v1/dataset/{dataset_id}/chunk \ - --header 'Content-Type: application/json' \ - --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - --raw '{ - "documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"] - }' + --url http://{address}/api/v1/dataset/{dataset_id}/chunk \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ + --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}' ``` #### Request parameters - `"dataset_id"`: (*Path parameter*) -- `"documents"`: (*Body parameter*) - - Documents to parse +- `"document_ids"`:(*Body parameter*) + The ids of the documents to be parsed ### Response The successful response includes a JSON object like the following: -```shell +```json { "code": 0 } @@ -747,10 +746,10 @@ The successful response includes a JSON object like the following: The error response includes a JSON object like the following: -```shell +```json { - "code": 3016, - "message": "Can't connect database" + "code": 102, + "message": "`document_ids` is required" } ``` @@ -762,35 +761,35 @@ Stop file parsing ### Request -- Method: POST -- URL: `/api/v1/dataset/{dataset_id}/chunk` +- Method: DELETE +- URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk` - Headers: - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - +- Body: + - `document_ids`:List[str] #### Request example -```shell +```bash curl --request DELETE \ - --url http://{address}/api/v1/dataset/{dataset_id}/chunk \ - --header 'Content-Type: application/json' \ - --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - --raw '{ - "documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"] - }' + --url http://{address}/api/v1/dataset/{dataset_id}/chunk \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ + --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}' ``` #### Request parameters - `"dataset_id"`: (*Path parameter*) -- `"documents"`: (*Body parameter*) - - Documents to stop parsing +- `"document_ids"`:(*Body parameter*) + The ids of the documents to be parsed + ### Response The successful response includes a JSON object like the following: -```shell +```json { "code": 0 } @@ -798,104 +797,98 @@ The successful response includes a JSON object like the following: The error response includes a JSON object like the following: -```shell +```json { - "code": 3016, - "message": "Can't connect database" + "code": 102, + "message": "`document_ids` is required" } ``` ## Get document chunk list -**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` +**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}` Get document chunk list ### Request - Method: GET -- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` +- URL: 
`http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}` - Headers: - - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' #### Request example -```shell +```bash curl --request GET \ - --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ - --header 'Content-Type: application/json' \ - --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' + --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id} \ + --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' ``` #### Request parameters - `"dataset_id"`: (*Path parameter*) - `"document_id"`: (*Path parameter*) - +- `"offset"`(*Filter parameter*) + The beginning number of records for paging. +- `"keywords"`(*Filter parameter*) + List chunks whose name has the given keywords +- `"limit"`(*Filter parameter*) + Records number to return +- `"id"`(*Filter parameter*) + The id of chunk to be got ### Response The successful response includes a JSON object like the following: -```shell +```json { - "code": 0 + "code": 0, "data": { - "chunks": [ - { - "available_int": 1, - "content": "advantagof ragflow increas accuraci and relev:by incorpor retriev inform , ragflow can gener respons that are more accur", - "document_keyword": "ragflow_test.txt", - "document_id": "77df9ef4759a11ef8bdd0242ac120004", - "id": "4ab8c77cfac1a829c8d5ed022a0808c0", - "image_id": "", - "important_keywords": [], - "positions": [ - "" - ] - } - ], + "chunks": [], "doc": { - "chunk_count": 5, - "create_date": "Wed, 18 Sep 2024 08:46:16 GMT", - "create_time": 1726649176833, - "created_by": "134408906b6811efbcd20242ac120005", - "id": "77df9ef4759a11ef8bdd0242ac120004", - "knowledgebase_id": "77d9d24e759a11ef880c0242ac120004", - "location": "ragflow_test.txt", - "name": "ragflow_test.txt", + "chunk_num": 0, + "create_date": "Sun, 29 Sep 2024 03:47:29 GMT", + "create_time": 1727581649216, + "created_by": "69736c5e723611efb51b0242ac120007", + "id": "8cb781ec7e1511ef98ac0242ac120006", + "kb_id": "c7ee74067a2c11efb21c0242ac120006", + "location": "明天的天气是晴天.txt", + "name": "明天的天气是晴天.txt", "parser_config": { - "chunk_token_count": 128, - "delimiter": "\n!?。;!?", - "layout_recognize": true, - "task_page_size": 12 + "pages": [ + [ + 1, + 1000000 + ] + ] }, - "parser_method": "naive", - "process_begin_at": "Wed, 18 Sep 2024 08:46:16 GMT", - "process_duation": 7.3213, - "progress": 1.0, - "progress_msg": "\nTask has been received.\nStart to parse.\nFinish parsing.\nFinished slicing files(5). Start to embedding the content.\nFinished embedding(6.16)! Start to build index!\nDone!", - "run": "3", - "size": 4209, + "parser_id": "naive", + "process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT", + "process_duation": 1435.37, + "progress": 0.0370833, + "progress_msg": "\nTask has been received.", + "run": "1", + "size": 24, "source_type": "local", "status": "1", "thumbnail": null, - "token_count": 746, + "token_num": 0, "type": "doc", - "update_date": "Wed, 18 Sep 2024 08:46:23 GMT", - "update_time": 1726649183321 + "update_date": "Tue, 15 Oct 2024 10:47:46 GMT", + "update_time": 1728989266371 }, - "total": 1 - }, + "total": 0 + } } ``` The error response includes a JSON object like the following: -```shell +```json { - "code": 3016, - "message": "Can't connect database" + "code": 102, + "message": "You don't own the document 5c5999ec7be811ef9cab0242ac12000e5." 
} ``` @@ -908,55 +901,96 @@ Delete document chunks ### Request - Method: DELETE -- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` +- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` - Headers: - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' +- Body: + - `chunk_ids`:List[str] #### Request example -```shell +```bash curl --request DELETE \ - --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ - --header 'Content-Type: application/json' \ - --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - --raw '{ - "chunks": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"] - }' + --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ + --data '{ + "chunk_ids": ["test_1", "test_2"] + }' ``` +#### Request parameters + +- `"chunk_ids"`:(*Body parameter*) + The chunks of the document to be deleted + +### Response +Success +```json +{ + "code": 0 +} +``` +Error +```json +{ + "code": 102, + "message": "`chunk_ids` is required" +} +``` + ## Update document chunk -**PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` +**PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}` Update document chunk ### Request - Method: PUT -- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` +- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}` - Headers: - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - +- Body: + - `content`:str + - `important_keywords`:str + - `available`:int #### Request example -```shell +```bash curl --request PUT \ - --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ - --header 'Content-Type: application/json' \ - --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - --raw '{ - "chunk_id": "d87fb0b7212c15c18d0831677552d7de", - "knowledgebase_id": null, - "name": "", - "content": "ragflow123", - "important_keywords": [], - "document_id": "e6bbba92759511efaa900242ac120004", - "status": "1" - }' + --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id} \ + --header 'Content-Type: application/json' \ + --header 'Authorization: {YOUR_ACCESS_TOKEN}' \ + --data '{ + "content": "ragflow123", + "important_keywords": [], +}' ``` +#### Request parameters +- `"content"`:(*Body parameter*) + Contains the main text or information of the chunk. +- `"important_keywords"`:(*Body parameter*) + list the key terms or phrases that are significant or central to the chunk's content. +- `"available"`:(*Body parameter*) + Indicating the availability status, 0 means unavailable and 1 means available. 
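For reference, a minimal Python sketch of the same request using the `requests` library. The server address, token, and IDs below are placeholders; the body fields follow the parameters above.

```python
import requests

# Placeholders -- substitute your own server address, API key, and IDs.
ADDRESS = "http://127.0.0.1:9380"
API_KEY = "YOUR_ACCESS_TOKEN"
dataset_id = "c7ee74067a2c11efb21c0242ac120006"
document_id = "5c5999ec7be811ef9cab0242ac120005"
chunk_id = "d78435d142bd5cf6704da62c778795c5"

url = f"{ADDRESS}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}"
payload = {
    "content": "ragflow123",            # new chunk text
    "important_keywords": ["ragflow"],  # must be a list
    "available": 1,                     # 0 = unavailable, 1 = available
}
resp = requests.put(url, headers={"Authorization": f"Bearer {API_KEY}"}, json=payload)
print(resp.json())  # {"code": 0} on success
```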
+### Response +Success +```json +{ + "code": 0 +} +``` +Error +```json +{ + "code": 102, + "message": "Can't find this chunk 29a2d9987e16ba331fb4d7d30d99b71d2" +} +``` ## Insert document chunks **POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` @@ -966,50 +1000,187 @@ Insert document chunks ### Request - Method: POST -- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` +- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` - Headers: - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - +- Body: + - `content`: str + - `important_keywords`:List[str] #### Request example -```shell +```bash curl --request POST \ - --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ - --header 'Content-Type: application/json' \ - --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - --raw '{ - "document_id": "97ad64b6759811ef9fc30242ac120004", - "content": ["ragflow content", "ragflow content"] - }' + --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ + --data '{ + "content": "ragflow content" +}' +``` +#### Request parameters +- `content`:(*Body parameter*) + Contains the main text or information of the chunk. +- `important_keywords`(*Body parameter*) + list the key terms or phrases that are significant or central to the chunk's content. + +### Response +Success +```json +{ + "code": 0, + "data": { + "chunk": { + "content": "ragflow content", + "create_time": "2024-10-16 08:05:04", + "create_timestamp": 1729065904.581025, + "dataset_id": [ + "c7ee74067a2c11efb21c0242ac120006" + ], + "document_id": "5c5999ec7be811ef9cab0242ac120005", + "id": "d78435d142bd5cf6704da62c778795c5", + "important_keywords": [] + } + } +} ``` +Error +```json +{ + "code": 102, + "message": "`content` is required" +} +``` ## Dataset retrieval test -**GET** `/api/v1/dataset/{dataset_id}/retrieval` +**GET** `/api/v1/retrieval` Retrieval test of a dataset ### Request -- Method: GET -- URL: `/api/v1/dataset/{dataset_id}/retrieval` +- Method: POST +- URL: `http://{address}/api/v1/retrieval` - Headers: - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - +- Body: + - `question`: str + - `datasets`: List[str] + - `documents`: List[str] + - `offset`: int + - `limit`: int + - `similarity_threshold`: float + - `vector_similarity_weight`: float + - `top_k`: int + - `rerank_id`: string + - `keyword`: bool + - `highlight`: bool #### Request example -```shell -curl --request GET \ - --url http://{address}/api/v1/dataset/{dataset_id}/retrieval \ - --header 'Content-Type: application/json' \ - --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - --raw '{ - "query_text": "This is a cat." - }' +```bash +curl --request POST \ + --url http://{address}/api/v1/retrieval \ + --header 'Content-Type: application/json' \ + --header 'Authorization: {YOUR_ACCESS_TOKEN}' \ + --data '{ + "question": "What is advantage of ragflow?", + "datasets": [ + "b2a62730759d11ef987d0242ac120004" + ], + "documents": [ + "77df9ef4759a11ef8bdd0242ac120004" + ] +}' ``` +#### Request parameter +- `"question"`: (*Body parameter*) + User's question, search keywords + `""` +- `"datasets"`: (*Body parameter*) + The scope of datasets + `None` +- `"documents"`: (*Body parameter*) + The scope of document. 
`None` means no limitation + `None` +- `"offset"`: (*Body parameter*) + The beginning point of retrieved records + `1` + +- `"limit"`: (*Body parameter*) + The maximum number of records needed to return + `30` + +- `"similarity_threshold"`: (*Body parameter*) + The minimum similarity score + `0.2` + +- `"vector_similarity_weight"`: (*Body parameter*) + The weight of vector cosine similarity, `1 - x` is the term similarity weight + `0.3` + +- `"top_k"`: (*Body parameter*) + Number of records engaged in vector cosine computation + `1024` + +- `"rerank_id"`: (*Body parameter*) + ID of the rerank model + `None` + +- `"keyword"`: (*Body parameter*) + Whether keyword-based matching is enabled + `False` + +- `"highlight"`: (*Body parameter*) + Whether to enable highlighting of matched terms in the results + `False` +### Response +Success +```json +{ + "code": 0, + "data": { + "chunks": [ + { + "content": "ragflow content", + "content_ltks": "ragflow content", + "document_id": "5c5999ec7be811ef9cab0242ac120005", + "document_keyword": "1.txt", + "highlight": "ragflow content", + "id": "d78435d142bd5cf6704da62c778795c5", + "img_id": "", + "important_keywords": [ + "" + ], + "kb_id": "c7ee74067a2c11efb21c0242ac120006", + "positions": [ + "" + ], + "similarity": 0.9669436601210759, + "term_similarity": 1.0, + "vector_similarity": 0.8898122004035864 + } + ], + "doc_aggs": [ + { + "count": 1, + "doc_id": "5c5999ec7be811ef9cab0242ac120005", + "doc_name": "1.txt" + } + ], + "total": 1 + } +} +``` +Error +```json +{ + "code": 102, + "message": "`datasets` is required." +} +``` ## Create chat **POST** `/api/v1/chat` @@ -1708,26 +1879,27 @@ Error ## Chat with a chat session -**POST** `/api/v1/chat/{chat_id}/session/{session_id}/completion` +**POST** `/api/v1/chat/{chat_id}/completion` Chat with a chat session ### Request - Method: POST -- URL: `http://{address} /api/v1/chat/{chat_id}/session/{session_id}/completion` +- URL: `http://{address} /api/v1/chat/{chat_id}/completion` - Headers: - `content-Type: application/json` - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - Body: - `question`: string - `stream`: bool + - `session_id`: str #### Request example ```bash curl --request POST \ - --url http://{address} /api/v1/chat/{chat_id}/session/{session_id}/completion \ + --url http://{address} /api/v1/chat/{chat_id}/completion \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ --data-binary '{ @@ -1743,6 +1915,8 @@ curl --request POST \ - `stream`: (*Body Parameter*) The approach of streaming text generation. `False` +- `session_id`: (*Body Parameter*) + The id of session.If not provided, a new session will be generated. ### Response Success ```json diff --git a/api/python_api_reference.md b/api/python_api_reference.md index 60c2d2e4d..56056cde1 100644 --- a/api/python_api_reference.md +++ b/api/python_api_reference.md @@ -244,42 +244,117 @@ File management inside knowledge base ## Upload document ```python -RAGFLOW.upload_document(ds:DataSet, name:str, blob:bytes)-> bool +DataSet.upload_documents(document_list: List[dict]) ``` ### Parameters -#### name - -#### blob - +#### document_list:`List[dict]` +A list composed of dicts containing `name` and `blob`. ### Returns +no return +### Examples +```python +from ragflow import RAGFlow + +rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") +ds = rag.create_dataset(name="kb_1") +ds.upload_documents([{name="1.txt", blob="123"}, ...] 
} +``` +--- + +## Update document + +```python +Document.update(update_message:dict) +``` + +### Parameters + +#### update_message:`dict` +only `name`,`parser_config`,`parser_method` can be changed + +### Returns + +no return ### Examples +```python +from ragflow import RAGFlow + +rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") +ds=rag.list_datasets(id='id') +ds=ds[0] +doc = ds.list_documents(id="wdfxb5t547d") +doc = doc[0] +doc.update([{"parser_method": "manual"...}]) +``` + --- -## Retrieve document +## Download document ```python -RAGFlow.get_document(id:str=None,name:str=None) -> Document +Document.download() -> bytes +``` + +### Returns + +bytes of the document. + +### Examples + +```python +from ragflow import RAGFlow + +rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") +ds=rag.list_datasets(id="id") +ds=ds[0] +doc = ds.list_documents(id="wdfxb5t547d") +doc = doc[0] +open("~/ragflow.txt", "wb+").write(doc.download()) +print(doc) +``` + +--- + +## List documents + +```python +Dataset.list_documents(id:str =None, keywords: str=None, offset: int=0, limit:int = 1024,order_by:str = "create_time", desc: bool = True) -> List[Document] ``` ### Parameters -#### id: `str`, *Required* +#### id: `str` -ID of the document to retrieve. +The id of the document to be got -#### name: `str` +#### keywords: `str` -Name or title of the document. +List documents whose name has the given keywords. Defaults to `None`. +#### offset: `int` + +The beginning number of records for paging. Defaults to `0`. + +#### limit: `int` + +Records number to return, -1 means all of them. Records number to return, -1 means all of them. + +#### orderby: `str` +The field by which the records should be sorted. This specifies the attribute or column used to order the results. + +#### desc:`bool` +A boolean flag indicating whether the sorting should be in descending order. ### Returns +List[Document] + A document object containing the following attributes: #### id: `str` @@ -352,98 +427,14 @@ Duration of the processing in seconds or minutes. Defaults to `0.0`. ```python from ragflow import RAGFlow -rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") -doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt') -print(doc) -``` - ---- - -## Save document settings - -```python -Document.save() -> bool -``` - -### Returns - -bool - -### Examples - -```python -from ragflow import RAGFlow - -rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") -doc = rag.get_document(id="wdfxb5t547d") -doc.parser_method= "manual" -doc.save() -``` - ---- - -## Download document - -```python -Document.download() -> bytes -``` - -### Returns - -bytes of the document. - -### Examples - -```python -from ragflow import RAGFlow - -rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") -doc = rag.get_document(id="wdfxb5t547d") -open("~/ragflow.txt", "w+").write(doc.download()) -print(doc) -``` - ---- - -## List documents - -```python -Dataset.list_docs(keywords: str=None, offset: int=0, limit:int = -1) -> List[Document] -``` - -### Parameters - -#### keywords: `str` - -List documents whose name has the given keywords. Defaults to `None`. - -#### offset: `int` - -The beginning number of records for paging. Defaults to `0`. - -#### limit: `int` - -Records number to return, -1 means all of them. Records number to return, -1 means all of them. 
- -### Returns - -List[Document] - -### Examples - -```python -from ragflow import RAGFlow - rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") ds = rag.create_dataset(name="kb_1") filename1 = "~/ragflow.txt" -rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read()) - -filename2 = "~/infinity.txt" -rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read()) - -for d in ds.list_docs(keywords="rag", offset=0, limit=12): +blob=open(filename1 , "rb").read() +list_files=[{"name":filename1,"blob":blob}] +ds.upload_documents(list_files) +for d in ds.list_documents(keywords="rag", offset=0, limit=12): print(d) ``` @@ -452,12 +443,11 @@ for d in ds.list_docs(keywords="rag", offset=0, limit=12): ## Delete documents ```python -Document.delete() -> bool +DataSet.delete_documents(ids: List[str] = None) ``` ### Returns -bool -description: delete success or not +no return ### Examples @@ -465,119 +455,87 @@ description: delete success or not from ragflow import RAGFlow rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") -ds = rag.create_dataset(name="kb_1") - -filename1 = "~/ragflow.txt" -rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read()) - -filename2 = "~/infinity.txt" -rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read()) -for d in ds.list_docs(keywords="rag", offset=0, limit=12): - d.delete() +ds = rag.list_datasets(name="kb_1") +ds = ds[0] +ds.delete_documents(ids=["id_1","id_2"]) ``` --- -## Parse document +## Parse and stop parsing document ```python -Document.async_parse() -> None -RAGFLOW.async_parse_documents() -> None +DataSet.async_parse_documents(document_ids:List[str]) -> None +DataSet.async_cancel_parse_documents(document_ids:List[str])-> None ``` ### Parameters +#### document_ids:`List[str]` +The ids of the documents to be parsed ???????????????????????????????????????????????????? ### Returns - +no return ???????????????????????????????????????????????????? ### Examples -```python -#document parse and cancel -rag = RAGFlow(API_KEY, HOST_ADDRESS) -ds = rag.create_dataset(name="dataset_name") -name3 = 'ai.pdf' -path = 'test_data/ai.pdf' -rag.create_document(ds, name=name3, blob=open(path, "rb").read()) -doc = rag.get_document(name="ai.pdf") -doc.async_parse() -print("Async parsing initiated") -``` - ---- - -## Cancel document parsing - -```python -rag.async_cancel_parse_documents(ids) -RAGFLOW.async_cancel_parse_documents()-> None -``` - -### Parameters - -#### ids, `list[]` - -### Returns - -????????????????????????????????????????????????? 
- -### Examples - ```python #documents parse and cancel rag = RAGFlow(API_KEY, HOST_ADDRESS) ds = rag.create_dataset(name="God5") documents = [ - {'name': 'test1.txt', 'path': 'test_data/test1.txt'}, - {'name': 'test2.txt', 'path': 'test_data/test2.txt'}, - {'name': 'test3.txt', 'path': 'test_data/test3.txt'} + {'name': 'test1.txt', 'blob': open('./test_data/test1.txt',"rb").read()}, + {'name': 'test2.txt', 'blob': open('./test_data/test2.txt',"rb").read()}, + {'name': 'test3.txt', 'blob': open('./test_data/test3.txt',"rb").read()} ] - -# Create documents in bulk -for doc_info in documents: - with open(doc_info['path'], "rb") as file: - created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read()) -docs = [rag.get_document(name=doc_info['name']) for doc_info in documents] -ids = [doc.id for doc in docs] - -rag.async_parse_documents(ids) +ds.upload_documents(documents) +documents=ds.list_documents(keywords="test") +ids=[] +for document in documents: + ids.append(document.id) +ds.async_parse_documents(ids) print("Async bulk parsing initiated") - -for doc in docs: - for progress, msg in doc.join(interval=5, timeout=10): - print(f"{doc.name}: Progress: {progress}, Message: {msg}") - -cancel_result = rag.async_cancel_parse_documents(ids) +ds.async_cancel_parse_documents(ids) print("Async bulk parsing cancelled") ``` ---- - -## Join document - -?????????????????? - +## List chunks ```python -Document.join(interval=15, timeout=3600) -> iteral[Tuple[float, str]] +Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = -1, id : str = None) -> List[Chunk] ``` - ### Parameters -#### interval: `int` +- `keywords`: `str` + List chunks whose name has the given keywords + default: `None` -Time interval in seconds for progress report. Defaults to `15`. +- `offset`: `int` + The beginning number of records for paging + default: `1` -#### timeout: `int` - -Timeout in seconds. Defaults to `3600`. +- `limit`: `int` + Records number to return + default: `30` +- `id`: `str` + The ID of the chunk to be retrieved + default: `None` ### Returns +List[chunk] -iteral[Tuple[float, str]] +### Examples +```python +from ragflow import RAGFlow +rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") +ds = rag.list_datasets("123") +ds = ds[0] +ds.async_parse_documents(["wdfxb5t547d"]) +for c in doc.list_chunks(keywords="rag", offset=0, limit=12): + print(c) +``` ## Add chunk ```python @@ -587,6 +545,9 @@ Document.add_chunk(content:str) -> Chunk ### Parameters #### content: `str`, *Required* +Contains the main text or information of the chunk. +#### important_keywords :`List[str]` +list the key terms or phrases that are significant or central to the chunk's content. 
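A short sketch that also passes `important_keywords`; this assumes `add_chunk()` accepts it as a keyword argument, as the parameter list above suggests.

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.list_datasets(id="123")[0].list_documents(id="wdfxb5t547d")[0]
# Assumption: add_chunk() forwards important_keywords as documented above.
chunk = doc.add_chunk(
    content="RAGFlow combines retrieval with generation.",
    important_keywords=["RAGFlow", "retrieval"],
)
print(chunk.id)
```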
### Returns @@ -598,7 +559,10 @@ chunk from ragflow import RAGFlow rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") -doc = rag.get_document(id="wdfxb5t547d") +ds = rag.list_datasets(id="123") +ds = ds[0] +doc = ds.list_documents(id="wdfxb5t547d") +doc = doc[0] chunk = doc.add_chunk(content="xxxxxxx") ``` @@ -607,12 +571,15 @@ chunk = doc.add_chunk(content="xxxxxxx") ## Delete chunk ```python -Chunk.delete() -> bool +Document.delete_chunks(chunk_ids: List[str]) ``` +### Parameters +#### chunk_ids:`List[str]` +The list of chunk_id ### Returns -bool +no return ### Examples @@ -620,22 +587,34 @@ bool from ragflow import RAGFlow rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") -doc = rag.get_document(id="wdfxb5t547d") +ds = rag.list_datasets(id="123") +ds = ds[0] +doc = ds.list_documents(id="wdfxb5t547d") +doc = doc[0] chunk = doc.add_chunk(content="xxxxxxx") -chunk.delete() +doc.delete_chunks(["id_1","id_2"]) ``` --- -## Save chunk contents +## Update chunk ```python -Chunk.save() -> bool +Chunk.update(update_message: dict) ``` +### Parameters +- `content`: `str` + Contains the main text or information of the chunk + +- `important_keywords`: `List[str]` + List the key terms or phrases that are significant or central to the chunk's content + +- `available`: `int` + Indicating the availability status, `0` means unavailable and `1` means available ### Returns -bool +no return ### Examples @@ -643,10 +622,12 @@ bool from ragflow import RAGFlow rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") -doc = rag.get_document(id="wdfxb5t547d") +ds = rag.list_datasets(id="123") +ds = ds[0] +doc = ds.list_documents(id="wdfxb5t547d") +doc = doc[0] chunk = doc.add_chunk(content="xxxxxxx") -chunk.content = "sdfx" -chunk.save() +chunk.update({"content":"sdfx...}) ``` --- @@ -654,7 +635,7 @@ chunk.save() ## Retrieval ```python -RAGFlow.retrieval(question:str, datasets:List[Dataset], document=List[Document]=None, offset:int=0, limit:int=6, similarity_threshold:float=0.1, vector_similarity_weight:float=0.3, top_k:int=1024) -> List[Chunk] +RAGFlow.retrieve(question:str="", datasets:List[str]=None, document=List[str]=None, offset:int=1, limit:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,higlight:bool=False) -> List[Chunk] ``` ### Parameters @@ -691,6 +672,15 @@ The weight of vector cosine similarity, 1 - x is the term similarity weight. Def Number of records engaged in vector cosine computaton. Defaults to `1024`. +#### rerank_id:`str` +ID of the rerank model. Defaults to `None`. + +#### keyword:`bool` +Indicating whether keyword-based matching is enabled (True) or disabled (False). + +#### highlight:`bool` + +Specifying whether to enable highlighting of matched terms in the results (True) or not (False). 
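A compact sketch that exercises the `keyword` and `highlight` switches described above. The dataset and document IDs are placeholders, and the last argument is assumed to be spelled `highlight` as in the parameter list.

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
# Placeholder IDs; in practice take them from list_datasets() / list_documents().
for c in rag.retrieve(
    question="What's ragflow?",
    datasets=["b2a62730759d11ef987d0242ac120004"],
    documents=["77df9ef4759a11ef8bdd0242ac120004"],
    similarity_threshold=0.2,
    vector_similarity_weight=0.3,
    keyword=True,    # enable keyword-based matching
    highlight=True,  # highlight matched terms in the results
):
    print(c)
```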
### Returns List[Chunk] @@ -701,18 +691,17 @@ List[Chunk] from ragflow import RAGFlow rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") -ds = rag.get_dataset(name="ragflow") +ds = rag.list_datasets(name="ragflow") +ds = ds[0] name = 'ragflow_test.txt' -path = 'test_data/ragflow_test.txt' +path = './test_data/ragflow_test.txt' rag.create_document(ds, name=name, blob=open(path, "rb").read()) -doc = rag.get_document(name=name) -doc.async_parse() -# Wait for parsing to complete -for progress, msg in doc.join(interval=5, timeout=30): - print(progress, msg) -for c in rag.retrieval(question="What's ragflow?", - datasets=[ds], documents=[doc], - offset=0, limit=6, similarity_threshold=0.1, +doc = ds.list_documents(name=name) +doc = doc[0] +ds.async_parse_documents([doc.id]) +for c in rag.retrieve(question="What's ragflow?", + datasets=[ds.id], documents=[doc.id], + offset=1, limit=30, similarity_threshold=0.2, vector_similarity_weight=0.3, top_k=1024 ): diff --git a/sdk/python/ragflow/modules/chunk.py b/sdk/python/ragflow/modules/chunk.py index e1cdd50eb..49132af91 100644 --- a/sdk/python/ragflow/modules/chunk.py +++ b/sdk/python/ragflow/modules/chunk.py @@ -17,32 +17,11 @@ class Chunk(Base): res_dict.pop(k) super().__init__(rag, res_dict) - def delete(self) -> bool: - """ - Delete the chunk in the document. - """ - res = self.post('/doc/chunk/rm', - {"document_id": self.document_id, 'chunk_ids': [self.id]}) - res = res.json() - if res.get("retmsg") == "success": - return True - raise Exception(res["retmsg"]) - def save(self) -> bool: - """ - Save the document details to the server. - """ - res = self.post('/doc/chunk/set', - {"chunk_id": self.id, - "knowledgebase_id": self.knowledgebase_id, - "name": self.document_name, - "content": self.content, - "important_keywords": self.important_keywords, - "document_id": self.document_id, - "available": self.available, - }) + def update(self,update_message:dict): + res = self.put(f"/dataset/{self.knowledgebase_id}/document/{self.document_id}/chunk/{self.id}",update_message) res = res.json() - if res.get("retmsg") == "success": - return True - raise Exception(res["retmsg"]) + if res.get("code") != 0 : + raise Exception(res["message"]) + diff --git a/sdk/python/ragflow/modules/dataset.py b/sdk/python/ragflow/modules/dataset.py index c58cc30f6..81615ac52 100644 --- a/sdk/python/ragflow/modules/dataset.py +++ b/sdk/python/ragflow/modules/dataset.py @@ -65,3 +65,14 @@ class DataSet(Base): if res.get("code") != 0: raise Exception(res["message"]) + def async_parse_documents(self,document_ids): + res = self.post(f"/dataset/{self.id}/chunk",{"document_ids":document_ids}) + res = res.json() + if res.get("code") != 0: + raise Exception(res.get("message")) + + def async_cancel_parse_documents(self,document_ids): + res = self.rm(f"/dataset/{self.id}/chunk",{"document_ids":document_ids}) + res = res.json() + if res.get("code") != 0: + raise Exception(res.get("message")) diff --git a/sdk/python/ragflow/modules/document.py b/sdk/python/ragflow/modules/document.py index 55c658d67..97766115a 100644 --- a/sdk/python/ragflow/modules/document.py +++ b/sdk/python/ragflow/modules/document.py @@ -1,7 +1,10 @@ import time +from PIL.ImageFile import raise_oserror + from .base import Base from .chunk import Chunk +from typing import List class Document(Base): @@ -29,160 +32,28 @@ class Document(Base): res_dict.pop(k) super().__init__(rag, res_dict) - def update(self,update_message:dict) -> bool: - """ - Save the document details to the server. 
- """ - res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message) + def list_chunks(self,offset=0, limit=30, keywords="", id:str=None): + data={"document_id": self.id,"keywords": keywords,"offset":offset,"limit":limit,"id":id} + res = self.get(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', data) res = res.json() - if res.get("code") != 0: - raise Exception(res["message"]) + if res.get("code") == 0: + chunks=[] + for data in res["data"].get("chunks"): + chunk = Chunk(self.rag,data) + chunks.append(chunk) + return chunks + raise Exception(res.get("message")) - def delete(self) -> bool: - """ - Delete the document from the server. - """ - res = self.rm('/doc/delete', - {"document_id": self.id}) - res = res.json() - if res.get("retmsg") == "success": - return True - raise Exception(res["retmsg"]) - - def download(self) -> bytes: - """ - Download the document content from the server using the Flask API. - - :return: The downloaded document content in bytes. - """ - # Construct the URL for the API request using the document ID and knowledge base ID - res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}") - - # Check the response status code to ensure the request was successful - if res.status_code == 200: - # Return the document content as bytes - return res.content - else: - # Handle the error and raise an exception - raise Exception( - f"Failed to download document. Server responded with: {res.status_code}, {res.text}" - ) - - def async_parse(self): - """ - Initiate document parsing asynchronously without waiting for completion. - """ - try: - # Construct request data including document ID and run status (assuming 1 means to run) - data = {"document_ids": [self.id], "run": 1} - - # Send a POST request to the specified parsing status endpoint to start parsing - res = self.post(f'/doc/run', data) - - # Check the server response status code - if res.status_code != 200: - raise Exception(f"Failed to start async parsing: {res.text}") - - print("Async parsing started successfully.") - - except Exception as e: - # Catch and handle exceptions - print(f"Error occurred during async parsing: {str(e)}") - raise - - import time - - def join(self, interval=5, timeout=3600): - """ - Wait for the asynchronous parsing to complete and yield parsing progress periodically. - - :param interval: The time interval (in seconds) for progress reports. - :param timeout: The timeout (in seconds) for the parsing operation. - :return: An iterator yielding parsing progress and messages. - """ - start_time = time.time() - while time.time() - start_time < timeout: - # Check the parsing status - res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]}) - res_data = res.json() - data = res_data.get("data", []) - - # Retrieve progress and status message - progress = data.get("progress", 0) - progress_msg = data.get("status", "") - - yield progress, progress_msg # Yield progress and message - - if progress == 100: # Parsing completed - break - - time.sleep(interval) - - def cancel(self): - """ - Cancel the parsing task for the document. - """ - try: - # Construct request data, including document ID and action to cancel (assuming 2 means cancel) - data = {"document_ids": [self.id], "run": 2} - - # Send a POST request to the specified parsing status endpoint to cancel parsing - res = self.post(f'/doc/run', data) - - # Check the server response status code - if res.status_code != 200: - print("Failed to cancel parsing. 
diff --git a/sdk/python/ragflow/modules/session.py b/sdk/python/ragflow/modules/session.py
index 56f4e2b84..e9805520d 100644
--- a/sdk/python/ragflow/modules/session.py
+++ b/sdk/python/ragflow/modules/session.py
@@ -15,8 +15,8 @@ class Session(Base):
         for message in self.messages:
             if "reference" in message:
                 message.pop("reference")
-        res = self.post(f"/chat/{self.chat_id}/session/{self.id}/completion",
-                        {"question": question, "stream": True}, stream=stream)
+        res = self.post(f"/chat/{self.chat_id}/completion",
+                        {"question": question, "stream": True, "session_id": self.id}, stream=stream)
         for line in res.iter_lines():
             line = line.decode("utf-8")
             if line.startswith("{"):
@@ -82,3 +82,4 @@ class Chunk(Base):
         self.term_similarity = None
         self.positions = None
         super().__init__(rag, res_dict)
+
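An HTTP-level sketch of the relocated completion endpoint shown above: `session_id` now travels in the request body instead of the URL path. The `/api/v1` prefix, the `Bearer` authorization header, and the assumption that each streamed line is a standalone JSON object are illustrative, not confirmed by this diff:

```python
import json
import requests

BASE_URL = "http://xxx.xx.xx.xxx:9380/api/v1"   # placeholder host; prefix assumed
headers = {"Authorization": "Bearer xxxxxx"}    # placeholder API key

payload = {"question": "What's ragflow?", "stream": True, "session_id": "<session_id>"}
with requests.post(f"{BASE_URL}/chat/<chat_id>/completion",
                   headers=headers, json=payload, stream=True) as res:
    for line in res.iter_lines():
        line = line.decode("utf-8")
        if line.startswith("{"):  # skip keep-alive / framing lines, as the SDK does
            print(json.loads(line))
```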
- """ - try: - if not doc_ids or not isinstance(doc_ids, list): - raise ValueError("doc_ids must be a non-empty list of document IDs") - - data = {"document_ids": doc_ids, "run": 1} - - res = self.post(f'/doc/run', data) - - if res.status_code != 200: - raise Exception(f"Failed to start async parsing for documents: {res.text}") - - print(f"Async parsing started successfully for documents: {doc_ids}") - - except Exception as e: - print(f"Error occurred during async parsing for documents: {str(e)}") - raise - - def async_cancel_parse_documents(self, doc_ids): - """ - Cancel the asynchronous parsing of multiple documents. - - :param doc_ids: A list containing multiple document IDs. - """ - try: - if not doc_ids or not isinstance(doc_ids, list): - raise ValueError("doc_ids must be a non-empty list of document IDs") - data = {"document_ids": doc_ids, "run": 2} - res = self.post(f'/doc/run', data) - - if res.status_code != 200: - raise Exception(f"Failed to cancel async parsing for documents: {res.text}") - - print(f"Async parsing canceled successfully for documents: {doc_ids}") - - except Exception as e: - print(f"Error occurred during canceling parsing for documents: {str(e)}") - raise - - def retrieval(self, - question, - datasets=None, - documents=None, - offset=0, - limit=6, - similarity_threshold=0.1, - vector_similarity_weight=0.3, - top_k=1024): - """ - Perform document retrieval based on the given parameters. - - :param question: The query question. - :param datasets: A list of datasets (optional, as documents may be provided directly). - :param documents: A list of documents (if specific documents are provided). - :param offset: Offset for the retrieval results. - :param limit: Maximum number of retrieval results. - :param similarity_threshold: Similarity threshold. - :param vector_similarity_weight: Weight of vector similarity. - :param top_k: Number of top most similar documents to consider (for pre-filtering or ranking). - - Note: This is a hypothetical implementation and may need adjustments based on the actual backend service API. 
- """ - try: - data = { - "question": question, - "datasets": datasets if datasets is not None else [], - "documents": [doc.id if hasattr(doc, 'id') else doc for doc in - documents] if documents is not None else [], + def retrieve(self, question="",datasets=None,documents=None, offset=1, limit=30, similarity_threshold=0.2,vector_similarity_weight=0.3,top_k=1024,rerank_id:str=None,keyword:bool=False,): + data_params = { "offset": offset, "limit": limit, "similarity_threshold": similarity_threshold, "vector_similarity_weight": vector_similarity_weight, "top_k": top_k, "knowledgebase_id": datasets, + "rerank_id":rerank_id, + "keyword":keyword + } + data_json ={ + "question": question, + "datasets": datasets, + "documents": documents } # Send a POST request to the backend service (using requests library as an example, actual implementation may vary) - res = self.post(f'/doc/retrieval_test', data) - - # Check the response status code - if res.status_code == 200: - res_data = res.json() - if res_data.get("retmsg") == "success": - chunks = [] - for chunk_data in res_data["data"].get("chunks", []): - chunk = Chunk(self, chunk_data) - chunks.append(chunk) - return chunks - else: - raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}") - else: - raise Exception(f"API request failed with status code {res.status_code}") - - except Exception as e: - print(f"An error occurred during retrieval: {e}") - raise + res = self.get(f'/retrieval', data_params,data_json) + res = res.json() + if res.get("code") ==0: + chunks=[] + for chunk_data in res["data"].get("chunks"): + chunk=Chunk(self,chunk_data) + chunks.append(chunk) + return chunks + raise Exception(res.get("message")) diff --git a/sdk/python/test/t_document.py b/sdk/python/test/t_document.py index e332e8145..d7c13a7af 100644 --- a/sdk/python/test/t_document.py +++ b/sdk/python/test/t_document.py @@ -63,17 +63,13 @@ class TestDocument(TestSdk): # Check if the retrieved document is of type Document if isinstance(doc, Document): # Download the document content and save it to a file - try: - with open("ragflow.txt", "wb+") as file: - file.write(doc.download()) - # Print the document object for debugging - print(doc) + with open("./ragflow.txt", "wb+") as file: + file.write(doc.download()) + # Print the document object for debugging + print(doc) - # Assert that the download was successful - assert True, "Document downloaded successfully." - except Exception as e: - # If an error occurs, raise an assertion error - assert False, f"Failed to download document, error: {str(e)}" + # Assert that the download was successful + assert True, f"Failed to download document, error: {doc}" else: # If the document retrieval fails, assert failure assert False, f"Failed to get document, error: {doc}" @@ -100,7 +96,7 @@ class TestDocument(TestSdk): blob2 = b"Sample document content for ingestion test222." list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}] ds.upload_documents(list_1) - for d in ds.list_docs(keywords="test", offset=0, limit=12): + for d in ds.list_documents(keywords="test", offset=0, limit=12): assert isinstance(d, Document), "Failed to upload documents" def test_delete_documents_in_dataset_with_success(self): @@ -123,16 +119,11 @@ class TestDocument(TestSdk): blob1 = b"Sample document content for ingestion test333." name2 = "Test Document444.txt" blob2 = b"Sample document content for ingestion test444." 
@@ -123,16 +119,11 @@ class TestDocument(TestSdk):
         blob1 = b"Sample document content for ingestion test333."
         name2 = "Test Document444.txt"
         blob2 = b"Sample document content for ingestion test444."
-        name3 = 'test.txt'
-        path = 'test_data/test.txt'
-        rag.create_document(ds, name=name3, blob=open(path, "rb").read())
-        rag.create_document(ds, name=name1, blob=blob1)
-        rag.create_document(ds, name=name2, blob=blob2)
-        for d in ds.list_docs(keywords="document", offset=0, limit=12):
+        ds.upload_documents([{"name": name1, "blob": blob1}, {"name": name2, "blob": blob2}])
+        for d in ds.list_documents(keywords="document", offset=0, limit=12):
             assert isinstance(d, Document)
-            d.delete()
-            print(d)
-        remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
+            ds.delete_documents([d.id])
+        remaining_docs = ds.list_documents(keywords="rag", offset=0, limit=12)
         assert len(remaining_docs) == 0, "Documents were not properly deleted."

     def test_parse_and_cancel_document(self):
@@ -144,16 +135,15 @@ class TestDocument(TestSdk):
         # Define the document name and path
         name3 = 'westworld.pdf'
-        path = 'test_data/westworld.pdf'
+        path = './test_data/westworld.pdf'

         # Create a document in the dataset using the file path
-        rag.create_document(ds, name=name3, blob=open(path, "rb").read())
+        ds.upload_documents([{"name": name3, "blob": open(path, "rb").read()}])

         # Retrieve the document by name
-        doc = rag.get_document(name="westworld.pdf")
-
-        # Initiate asynchronous parsing
-        doc.async_parse()
+        doc = ds.list_documents(name="westworld.pdf")
+        doc = doc[0]
+        ds.async_parse_documents(document_ids=[doc.id])

         # Print message to confirm asynchronous parsing has been initiated
         print("Async parsing initiated")