Refactor Chunk API (#2855)

### What problem does this PR solve?

Refactor Chunk API
#2846
### Type of change


- [x] Refactoring

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
liuhua 2024-10-16 18:41:24 +08:00 committed by GitHub
parent b9fa00f341
commit dab92ac1e8
11 changed files with 760 additions and 791 deletions

View File

@ -119,13 +119,11 @@ def update_doc(tenant_id, dataset_id, document_id):
if informs: if informs:
e, file = FileService.get_by_id(informs[0].file_id) e, file = FileService.get_by_id(informs[0].file_id)
FileService.update_by_id(file.id, {"name": req["name"]}) FileService.update_by_id(file.id, {"name": req["name"]})
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])
if "parser_method" in req: if "parser_method" in req:
if doc.parser_id.lower() == req["parser_method"].lower(): if doc.parser_id.lower() == req["parser_method"].lower():
if "parser_config" in req: return get_result()
if req["parser_config"] == doc.parser_config:
return get_result(retcode=RetCode.SUCCESS)
else:
return get_result(retcode=RetCode.SUCCESS)
if doc.type == FileType.VISUAL or re.search( if doc.type == FileType.VISUAL or re.search(
r"\.(ppt|pptx|pages)$", doc.name): r"\.(ppt|pptx|pages)$", doc.name):
@ -146,8 +144,6 @@ def update_doc(tenant_id, dataset_id, document_id):
return get_error_data_result(retmsg="Tenant not found!") return get_error_data_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery( ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)) Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])
return get_result() return get_result()
@ -258,6 +254,8 @@ def parse(tenant_id,dataset_id):
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
req = request.json req = request.json
if not req.get("document_ids"):
return get_error_data_result("`document_ids` is required")
for id in req["document_ids"]: for id in req["document_ids"]:
if not DocumentService.query(id=id,kb_id=dataset_id): if not DocumentService.query(id=id,kb_id=dataset_id):
return get_error_data_result(retmsg=f"You don't own the document {id}.") return get_error_data_result(retmsg=f"You don't own the document {id}.")
@ -283,9 +281,14 @@ def stop_parsing(tenant_id,dataset_id):
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
req = request.json req = request.json
if not req.get("document_ids"):
return get_error_data_result("`document_ids` is required")
for id in req["document_ids"]: for id in req["document_ids"]:
if not DocumentService.query(id=id,kb_id=dataset_id): doc = DocumentService.query(id=id, kb_id=dataset_id)
if not doc:
return get_error_data_result(retmsg=f"You don't own the document {id}.") return get_error_data_result(retmsg=f"You don't own the document {id}.")
if doc[0].progress == 100.0 or doc[0].progress == 0.0:
return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
info = {"run": "2", "progress": 0} info = {"run": "2", "progress": 0}
DocumentService.update_by_id(id, info) DocumentService.update_by_id(id, info)
# if str(req["run"]) == TaskStatus.CANCEL.value: # if str(req["run"]) == TaskStatus.CANCEL.value:
@ -297,7 +300,7 @@ def stop_parsing(tenant_id,dataset_id):
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET']) @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
@token_required @token_required
def list_chunk(tenant_id,dataset_id,document_id): def list_chunks(tenant_id,dataset_id,document_id):
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
doc=DocumentService.query(id=document_id, kb_id=dataset_id) doc=DocumentService.query(id=document_id, kb_id=dataset_id)
@ -309,16 +312,13 @@ def list_chunk(tenant_id,dataset_id,document_id):
page = int(req.get("offset", 1)) page = int(req.get("offset", 1))
size = int(req.get("limit", 30)) size = int(req.get("limit", 30))
question = req.get("keywords", "") question = req.get("keywords", "")
try:
query = { query = {
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
} }
if "available_int" in req:
query["available_int"] = int(req["available_int"])
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()} res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
origin_chunks = [] origin_chunks = []
sign = 0
for id in sres.ids: for id in sres.ids:
d = { d = {
"chunk_id": id, "chunk_id": id,
@ -340,7 +340,15 @@ def list_chunk(tenant_id,dataset_id,document_id):
d["positions"] = poss d["positions"] = poss
origin_chunks.append(d) origin_chunks.append(d)
##rename keys if req.get("id"):
if req.get("id") == id:
origin_chunks.clear()
origin_chunks.append(d)
sign = 1
break
if req.get("id"):
if sign == 0:
return get_error_data_result(f"Can't find this chunk {req.get('id')}")
for chunk in origin_chunks: for chunk in origin_chunks:
key_mapping = { key_mapping = {
"chunk_id": "id", "chunk_id": "id",
@ -355,11 +363,7 @@ def list_chunk(tenant_id,dataset_id,document_id):
renamed_chunk[new_key] = value renamed_chunk[new_key] = value
res["chunks"].append(renamed_chunk) res["chunks"].append(renamed_chunk)
return get_result(data=res) return get_result(data=res)
except Exception as e:
if str(e).find("not_found") > 0:
return get_result(retmsg=f'No chunk found!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST']) @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
@ -374,6 +378,9 @@ def create(tenant_id,dataset_id,document_id):
req = request.json req = request.json
if not req.get("content"): if not req.get("content"):
return get_error_data_result(retmsg="`content` is required") return get_error_data_result(retmsg="`content` is required")
if "important_keywords" in req:
if type(req["important_keywords"]) != list:
return get_error_data_result("`important_keywords` is required to be a list")
md5 = hashlib.md5() md5 = hashlib.md5()
md5.update((req["content"] + document_id).encode("utf-8")) md5.update((req["content"] + document_id).encode("utf-8"))
@ -381,8 +388,8 @@ def create(tenant_id,dataset_id,document_id):
d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]), d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
"content_with_weight": req["content"]} "content_with_weight": req["content"]}
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["important_kwd"] = req.get("important_kwd", []) d["important_kwd"] = req.get("important_keywords", [])
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", [])))
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp() d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
d["kb_id"] = [doc.kb_id] d["kb_id"] = [doc.kb_id]
@ -432,12 +439,12 @@ def rm_chunk(tenant_id,dataset_id,document_id):
req = request.json req = request.json
if not req.get("chunk_ids"): if not req.get("chunk_ids"):
return get_error_data_result("`chunk_ids` is required") return get_error_data_result("`chunk_ids` is required")
query = {
"doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
for chunk_id in req.get("chunk_ids"): for chunk_id in req.get("chunk_ids"):
res = ELASTICSEARCH.get( if chunk_id not in sres.ids:
chunk_id, search.index_name( return get_error_data_result(f"Chunk {chunk_id} not found")
tenant_id))
if not res.get("found"):
return server_error_response(f"Chunk {chunk_id} not found")
if not ELASTICSEARCH.deleteByQuery( if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)): Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
return get_error_data_result(retmsg="Index updating failure") return get_error_data_result(retmsg="Index updating failure")
@ -451,23 +458,35 @@ def rm_chunk(tenant_id,dataset_id,document_id):
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT']) @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
@token_required @token_required
def set(tenant_id,dataset_id,document_id,chunk_id): def set(tenant_id,dataset_id,document_id,chunk_id):
try:
res = ELASTICSEARCH.get( res = ELASTICSEARCH.get(
chunk_id, search.index_name( chunk_id, search.index_name(
tenant_id)) tenant_id))
if not res.get("found"): except Exception as e:
return get_error_data_result(f"Chunk {chunk_id} not found") return get_error_data_result(f"Can't find this chunk {chunk_id}")
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
doc = DocumentService.query(id=document_id, kb_id=dataset_id) doc = DocumentService.query(id=document_id, kb_id=dataset_id)
if not doc: if not doc:
return get_error_data_result(retmsg=f"You don't own the document {document_id}.") return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
doc = doc[0]
query = {
"doc_ids": [document_id], "page": 1, "size": 1024, "question": "", "sort": True
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
if chunk_id not in sres.ids:
return get_error_data_result(f"You don't own the chunk {chunk_id}")
req = request.json req = request.json
content=res["_source"].get("content_with_weight")
d = { d = {
"id": chunk_id, "id": chunk_id,
"content_with_weight": req.get("content",res.get["content_with_weight"])} "content_with_weight": req.get("content",content)}
d["content_ltks"] = rag_tokenizer.tokenize(req["content"]) d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["important_kwd"] = req.get("important_keywords",[]) if "important_keywords" in req:
if type(req["important_keywords"]) != list:
return get_error_data_result("`important_keywords` is required to be a list")
d["important_kwd"] = req.get("important_keywords")
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"])) d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
if "available" in req: if "available" in req:
d["available_int"] = req["available"] d["available_int"] = req["available"]
@ -478,7 +497,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id):
arr = [ arr = [
t for t in re.split( t for t in re.split(
r"[\n\t]", r"[\n\t]",
req["content"]) if len(t) > 1] d["content_with_weight"]) if len(t) > 1]
if len(arr) != 2: if len(arr) != 2:
return get_error_data_result( return get_error_data_result(
retmsg="Q&A must be separated by TAB/ENTER key.") retmsg="Q&A must be separated by TAB/ENTER key.")
@ -486,7 +505,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id):
d = beAdoc(d, arr[0], arr[1], not any( d = beAdoc(d, arr[0], arr[1], not any(
[rag_tokenizer.is_chinese(t) for t in q + a])) [rag_tokenizer.is_chinese(t) for t in q + a]))
v, c = embd_mdl.encode([doc.name, req["content"]]) v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist() d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
@ -505,7 +524,7 @@ def retrieval_test(tenant_id):
for id in kb_id: for id in kb_id:
if not KnowledgebaseService.query(id=id,tenant_id=tenant_id): if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
return get_error_data_result(f"You don't own the dataset {id}.") return get_error_data_result(f"You don't own the dataset {id}.")
if "question" not in req_json: if "question" not in req:
return get_error_data_result("`question` is required.") return get_error_data_result("`question` is required.")
page = int(req.get("offset", 1)) page = int(req.get("offset", 1))
size = int(req.get("limit", 30)) size = int(req.get("limit", 30))

View File

@ -24,10 +24,9 @@ from api.utils import get_uuid
from api.utils.api_utils import get_error_data_result from api.utils.api_utils import get_error_data_result
from api.utils.api_utils import get_result, token_required from api.utils.api_utils import get_result, token_required
@manager.route('/chat/<chat_id>/session', methods=['POST']) @manager.route('/chat/<chat_id>/session', methods=['POST'])
@token_required @token_required
def create(tenant_id, chat_id): def create(tenant_id,chat_id):
req = request.json req = request.json
req["dialog_id"] = chat_id req["dialog_id"] = chat_id
dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value) dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value)
@ -51,14 +50,13 @@ def create(tenant_id, chat_id):
del conv["reference"] del conv["reference"]
return get_result(data=conv) return get_result(data=conv)
@manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT']) @manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT'])
@token_required @token_required
def update(tenant_id, chat_id, session_id): def update(tenant_id,chat_id,session_id):
req = request.json req = request.json
req["dialog_id"] = chat_id req["dialog_id"] = chat_id
conv_id = session_id conv_id = session_id
conv = ConversationService.query(id=conv_id, dialog_id=chat_id) conv = ConversationService.query(id=conv_id,dialog_id=chat_id)
if not conv: if not conv:
return get_error_data_result(retmsg="Session does not exist") return get_error_data_result(retmsg="Session does not exist")
if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value): if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
@ -74,16 +72,30 @@ def update(tenant_id, chat_id, session_id):
return get_result() return get_result()
@manager.route('/chat/<chat_id>/session/<session_id>/completion', methods=['POST']) @manager.route('/chat/<chat_id>/completion', methods=['POST'])
@token_required @token_required
def completion(tenant_id, chat_id, session_id): def completion(tenant_id,chat_id):
req = request.json req = request.json
# req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [ # req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [
# {"role": "user", "content": "上海有吗?"} # {"role": "user", "content": "上海有吗?"}
# ]} # ]}
if not req.get("session_id"):
conv = {
"id": get_uuid(),
"dialog_id": chat_id,
"name": req.get("name", "New session"),
"message": [{"role": "assistant", "content": "Hi! I am your assistantcan I help you?"}]
}
if not conv.get("name"):
return get_error_data_result(retmsg="Name can not be empty.")
ConversationService.save(**conv)
e, conv = ConversationService.get_by_id(conv["id"])
session_id=conv.id
else:
session_id = req.get("session_id")
if not req.get("question"): if not req.get("question"):
return get_error_data_result(retmsg="Please input your question.") return get_error_data_result(retmsg="Please input your question.")
conv = ConversationService.query(id=session_id, dialog_id=chat_id) conv = ConversationService.query(id=session_id,dialog_id=chat_id)
if not conv: if not conv:
return get_error_data_result(retmsg="Session does not exist") return get_error_data_result(retmsg="Session does not exist")
conv = conv[0] conv = conv[0]
@ -117,6 +129,7 @@ def completion(tenant_id, chat_id, session_id):
conv.message[-1] = {"role": "assistant", "content": ans["answer"], conv.message[-1] = {"role": "assistant", "content": ans["answer"],
"id": message_id, "prompt": ans.get("prompt", "")} "id": message_id, "prompt": ans.get("prompt", "")}
ans["id"] = message_id ans["id"] = message_id
ans["session_id"]=session_id
def stream(): def stream():
nonlocal dia, msg, req, conv nonlocal dia, msg, req, conv
@ -127,7 +140,7 @@ def completion(tenant_id, chat_id, session_id):
ConversationService.update_by_id(conv.id, conv.to_dict()) ConversationService.update_by_id(conv.id, conv.to_dict())
except Exception as e: except Exception as e:
yield "data:" + json.dumps({"code": 500, "message": str(e), yield "data:" + json.dumps({"code": 500, "message": str(e),
"data": {"answer": "**ERROR**: " + str(e), "reference": []}}, "data": {"answer": "**ERROR**: " + str(e),"reference": []}},
ensure_ascii=False) + "\n\n" ensure_ascii=False) + "\n\n"
yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n" yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n"
@ -148,15 +161,14 @@ def completion(tenant_id, chat_id, session_id):
break break
return get_result(data=answer) return get_result(data=answer)
@manager.route('/chat/<chat_id>/session', methods=['GET']) @manager.route('/chat/<chat_id>/session', methods=['GET'])
@token_required @token_required
def list(chat_id, tenant_id): def list(chat_id,tenant_id):
if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value): if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value):
return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.") return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.")
id = request.args.get("id") id = request.args.get("id")
name = request.args.get("name") name = request.args.get("name")
session = ConversationService.query(id=id, name=name, dialog_id=chat_id) session = ConversationService.query(id=id,name=name,dialog_id=chat_id)
if not session: if not session:
return get_error_data_result(retmsg="The session doesn't exist") return get_error_data_result(retmsg="The session doesn't exist")
page_number = int(request.args.get("page", 1)) page_number = int(request.args.get("page", 1))
@ -166,7 +178,7 @@ def list(chat_id, tenant_id):
desc = False desc = False
else: else:
desc = True desc = True
convs = ConversationService.get_list(chat_id, page_number, items_per_page, orderby, desc, id, name) convs = ConversationService.get_list(chat_id,page_number,items_per_page,orderby,desc,id,name)
if not convs: if not convs:
return get_result(data=[]) return get_result(data=[])
for conv in convs: for conv in convs:
@ -201,17 +213,16 @@ def list(chat_id, tenant_id):
del conv["reference"] del conv["reference"]
return get_result(data=convs) return get_result(data=convs)
@manager.route('/chat/<chat_id>/session', methods=["DELETE"]) @manager.route('/chat/<chat_id>/session', methods=["DELETE"])
@token_required @token_required
def delete(tenant_id, chat_id): def delete(tenant_id,chat_id):
if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value): if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
return get_error_data_result(retmsg="You don't own the chat") return get_error_data_result(retmsg="You don't own the chat")
ids = request.json.get("ids") ids = request.json.get("ids")
if not ids: if not ids:
return get_error_data_result(retmsg="`ids` is required in deleting operation") return get_error_data_result(retmsg="`ids` is required in deleting operation")
for id in ids: for id in ids:
conv = ConversationService.query(id=id, dialog_id=chat_id) conv = ConversationService.query(id=id,dialog_id=chat_id)
if not conv: if not conv:
return get_error_data_result(retmsg="The chat doesn't own the session") return get_error_data_result(retmsg="The chat doesn't own the session")
ConversationService.delete_by_id(id) ConversationService.delete_by_id(id)

View File

@ -61,14 +61,13 @@ class DocumentService(CommonService):
docs = docs.where( docs = docs.where(
fn.LOWER(cls.model.name).contains(keywords.lower()) fn.LOWER(cls.model.name).contains(keywords.lower())
) )
count = docs.count()
if desc: if desc:
docs = docs.order_by(cls.model.getter_by(orderby).desc()) docs = docs.order_by(cls.model.getter_by(orderby).desc())
else: else:
docs = docs.order_by(cls.model.getter_by(orderby).asc()) docs = docs.order_by(cls.model.getter_by(orderby).asc())
docs = docs.paginate(page_number, items_per_page) docs = docs.paginate(page_number, items_per_page)
count = docs.count()
return list(docs.dicts()), count return list(docs.dicts()), count
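The point of this reordering is that `count()` on a query that has already been paginated only counts the rows of the current page, so the API would report the page size instead of the total number of matching documents. A minimal sketch of the corrected ordering (illustrative peewee-style code with hypothetical names, not the project's actual models):

```python
def list_documents(docs_query, page_number, items_per_page, orderby_field, desc=True):
    # Count the full result set BEFORE applying LIMIT/OFFSET;
    # counting after paginate() would return at most one page of rows.
    total = docs_query.count()

    order = orderby_field.desc() if desc else orderby_field.asc()
    page_query = docs_query.order_by(order).paginate(page_number, items_per_page)

    return list(page_query.dicts()), total
```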

View File

@ -432,18 +432,71 @@ The error response includes a JSON object like the following:
} }
``` ```
## Delete files from a dataset
**DELETE** `/api/v1/dataset/{dataset_id}/document`
Delete files from a dataset
### Request
- Method: DELETE
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document`
- Headers:
- 'Content-Type: application/json'
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `ids`:List[str]
#### Request example
```bash
curl --request DELETE \
--url http://{address}/api/v1/dataset/{dataset_id}/document \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data '{
"ids": ["id_1","id_2"]
}'
```
#### Request parameters
- `"ids"`: (*Body parameter*)
The IDs of the documents to be deleted
### Response
The successful response includes a JSON object like the following:
```json
{
"code": 0
}
```
- `"error_code"`: `integer`
`0`: The operation succeeds.
The error response includes a JSON object like the following:
```json
{
"code": 102,
"message": "You do not own the dataset 7898da028a0511efbf750242ac1220005."
}
```
## Download a file from a dataset ## Download a file from a dataset
**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}` **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}`
Downloads files from a dataset. Downloads a file from a dataset.
### Request ### Request
- Method: GET - Method: GET
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}` - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}`
- Headers: - Headers:
- `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Output: - Output:
- '{FILE_NAME}' - '{FILE_NAME}'
@ -451,10 +504,9 @@ Downloads files from a dataset.
```bash ```bash
curl --request GET \ curl --request GET \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{documents_id} \ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
--header 'Content-Type: application/json' \ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' --output ./ragflow.txt
--output '{FILE_NAME}'
``` ```
#### Request parameters #### Request parameters
@ -466,7 +518,7 @@ curl --request GET \
### Response ### Response
The successful response includes a JSON object like the following: The successful response includes a text object like the following:
```text ```text
test_2. test_2.
@ -596,92 +648,39 @@ Update a file in a dataset
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `name`:`string`
- `parser_method`:`string`
- `parser_config`:`dict`
#### Request example #### Request example
```bash ```bash
curl --request PUT \ curl --request PUT \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \ --url http://{address}/api/v1/dataset/{dataset_id}/info/{document_id} \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--data '{ --data '{
"name": "manual.txt", "name": "manual.txt",
"thumbnail": null,
"knowledgebase_id": "779333c0758611ef910f0242ac120004",
"parser_method": "manual", "parser_method": "manual",
"parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12}, "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12}
"source_type": "local", "type": "doc",
"created_by": "134408906b6811efbcd20242ac120005",
"size": 0, "token_count": 0, "chunk_count": 0,
"progress": 0.0,
"progress_msg": "",
"process_begin_at": null,
"process_duration": 0.0
}' }'
``` ```
#### Request parameters #### Request parameters
- `"thumbnail"`: (*Body parameter*)
Thumbnail image of the document.
- `""`
- `"knowledgebase_id"`: (*Body parameter*)
Knowledge base ID related to the document.
- `""`
- `"parser_method"`: (*Body parameter*) - `"parser_method"`: (*Body parameter*)
Method used to parse the document. Method used to parse the document.
- `""`
- `"parser_config"`: (*Body parameter*) - `"parser_config"`: (*Body parameter*)
Configuration object for the parser. Configuration object for the parser.
- If the value is `None`, a dictionary with default values will be generated. - If the value is `None`, a dictionary with default values will be generated.
- `"source_type"`: (*Body parameter*)
Source type of the document.
- `""`
- `"type"`: (*Body parameter*)
Type or category of the document.
- `""`
- `"created_by"`: (*Body parameter*)
Creator of the document.
- `""`
- `"name"`: (*Body parameter*) - `"name"`: (*Body parameter*)
Name or title of the document. Name or title of the document.
- `""`
- `"size"`: (*Body parameter*)
Size of the document in bytes or some other unit.
- `0`
- `"token_count"`: (*Body parameter*)
Number of tokens in the document.
- `0`
- `"chunk_count"`: (*Body parameter*)
Number of chunks the document is split into.
- `0`
- `"progress"`: (*Body parameter*)
Current processing progress as a percentage.
- `0.0`
- `"progress_msg"`: (*Body parameter*)
Message indicating current progress status.
- `""`
- `"process_begin_at"`: (*Body parameter*)
Start time of the document processing.
- `None`
- `"process_duration"`: (*Body parameter*)
Duration of the processing in seconds or minutes.
- `0.0`
### Response ### Response
@ -712,34 +711,34 @@ Parse files into chunks in a dataset
### Request ### Request
- Method: POST - Method: POST
- URL: `/api/v1/dataset/{dataset_id}/chunk` - URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk `
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `document_ids`:List[str]
#### Request example #### Request example
```shell ```bash
curl --request POST \ curl --request POST \
--url http://{address}/api/v1/dataset/{dataset_id}/chunk \ --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--raw '{ --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
"documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
}'
``` ```
#### Request parameters #### Request parameters
- `"dataset_id"`: (*Path parameter*) - `"dataset_id"`: (*Path parameter*)
- `"documents"`: (*Body parameter*) - `"document_ids"`:(*Body parameter*)
- Documents to parse The ids of the documents to be parsed
### Response ### Response
The successful response includes a JSON object like the following: The successful response includes a JSON object like the following:
```shell ```json
{ {
"code": 0 "code": 0
} }
@ -747,10 +746,10 @@ The successful response includes a JSON object like the following:
The error response includes a JSON object like the following: The error response includes a JSON object like the following:
```shell ```json
{ {
"code": 3016, "code": 102,
"message": "Can't connect database" "message": "`document_ids` is required"
} }
``` ```
@ -762,35 +761,35 @@ Stop file parsing
### Request ### Request
- Method: POST - Method: DELETE
- URL: `/api/v1/dataset/{dataset_id}/chunk` - URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `document_ids`:List[str]
#### Request example #### Request example
```shell ```bash
curl --request DELETE \ curl --request DELETE \
--url http://{address}/api/v1/dataset/{dataset_id}/chunk \ --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--raw '{ --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
"documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
}'
``` ```
#### Request parameters #### Request parameters
- `"dataset_id"`: (*Path parameter*) - `"dataset_id"`: (*Path parameter*)
- `"documents"`: (*Body parameter*) - `"document_ids"`:(*Body parameter*)
- Documents to stop parsing The IDs of the documents for which parsing is to be stopped
### Response ### Response
The successful response includes a JSON object like the following: The successful response includes a JSON object like the following:
```shell ```json
{ {
"code": 0 "code": 0
} }
@ -798,33 +797,31 @@ The successful response includes a JSON object like the following:
The error response includes a JSON object like the following: The error response includes a JSON object like the following:
```shell ```json
{ {
"code": 3016, "code": 102,
"message": "Can't connect database" "message": "`document_ids` is required"
} }
``` ```
## Get document chunk list ## Get document chunk list
**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`
Get document chunk list Get document chunk list
### Request ### Request
- Method: GET - Method: GET
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`
- Headers: - Headers:
- `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
#### Request example #### Request example
```shell ```bash
curl --request GET \ curl --request GET \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id} \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
``` ```
@ -832,70 +829,66 @@ curl --request GET \
- `"dataset_id"`: (*Path parameter*) - `"dataset_id"`: (*Path parameter*)
- `"document_id"`: (*Path parameter*) - `"document_id"`: (*Path parameter*)
- `"offset"`(*Filter parameter*)
The beginning number of records for paging.
- `"keywords"`(*Filter parameter*)
List chunks whose name has the given keywords
- `"limit"`(*Filter parameter*)
Records number to return
- `"id"`(*Filter parameter*)
The id of chunk to be got
### Response ### Response
The successful response includes a JSON object like the following: The successful response includes a JSON object like the following:
```shell ```json
{ {
"code": 0 "code": 0,
"data": { "data": {
"chunks": [ "chunks": [],
{
"available_int": 1,
"content": "<em>advantag</em>of ragflow increas accuraci and relev:by incorpor retriev inform , ragflow can gener respons that are more accur",
"document_keyword": "ragflow_test.txt",
"document_id": "77df9ef4759a11ef8bdd0242ac120004",
"id": "4ab8c77cfac1a829c8d5ed022a0808c0",
"image_id": "",
"important_keywords": [],
"positions": [
""
]
}
],
"doc": { "doc": {
"chunk_count": 5, "chunk_num": 0,
"create_date": "Wed, 18 Sep 2024 08:46:16 GMT", "create_date": "Sun, 29 Sep 2024 03:47:29 GMT",
"create_time": 1726649176833, "create_time": 1727581649216,
"created_by": "134408906b6811efbcd20242ac120005", "created_by": "69736c5e723611efb51b0242ac120007",
"id": "77df9ef4759a11ef8bdd0242ac120004", "id": "8cb781ec7e1511ef98ac0242ac120006",
"knowledgebase_id": "77d9d24e759a11ef880c0242ac120004", "kb_id": "c7ee74067a2c11efb21c0242ac120006",
"location": "ragflow_test.txt", "location": "明天的天气是晴天.txt",
"name": "ragflow_test.txt", "name": "明天的天气是晴天.txt",
"parser_config": { "parser_config": {
"chunk_token_count": 128, "pages": [
"delimiter": "\n!?。;!?", [
"layout_recognize": true, 1,
"task_page_size": 12 1000000
]
]
}, },
"parser_method": "naive", "parser_id": "naive",
"process_begin_at": "Wed, 18 Sep 2024 08:46:16 GMT", "process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT",
"process_duation": 7.3213, "process_duation": 1435.37,
"progress": 1.0, "progress": 0.0370833,
"progress_msg": "\nTask has been received.\nStart to parse.\nFinish parsing.\nFinished slicing files(5). Start to embedding the content.\nFinished embedding(6.16)! Start to build index!\nDone!", "progress_msg": "\nTask has been received.",
"run": "3", "run": "1",
"size": 4209, "size": 24,
"source_type": "local", "source_type": "local",
"status": "1", "status": "1",
"thumbnail": null, "thumbnail": null,
"token_count": 746, "token_num": 0,
"type": "doc", "type": "doc",
"update_date": "Wed, 18 Sep 2024 08:46:23 GMT", "update_date": "Tue, 15 Oct 2024 10:47:46 GMT",
"update_time": 1726649183321 "update_time": 1728989266371
},
"total": 1
}, },
"total": 0
}
} }
``` ```
The error response includes a JSON object like the following: The error response includes a JSON object like the following:
```shell ```json
{ {
"code": 3016, "code": 102,
"message": "Can't connect database" "message": "You don't own the document 5c5999ec7be811ef9cab0242ac12000e5."
} }
``` ```
@ -908,55 +901,96 @@ Delete document chunks
### Request ### Request
- Method: DELETE - Method: DELETE
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `chunk_ids`:List[str]
#### Request example #### Request example
```shell ```bash
curl --request DELETE \ curl --request DELETE \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--raw '{ --data '{
"chunks": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"] "chunk_ids": ["test_1", "test_2"]
}' }'
``` ```
#### Request parameters
- `"chunk_ids"`:(*Body parameter*)
The IDs of the chunks to be deleted
### Response
Success
```json
{
"code": 0
}
```
Error
```json
{
"code": 102,
"message": "`chunk_ids` is required"
}
```
## Update document chunk ## Update document chunk
**PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` **PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`
Update document chunk Update document chunk
### Request ### Request
- Method: PUT - Method: PUT
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `content`:str
- `important_keywords`:str
- `available`:int
#### Request example #### Request example
```shell ```bash
curl --request PUT \ curl --request PUT \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id} \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--raw '{ --data '{
"chunk_id": "d87fb0b7212c15c18d0831677552d7de",
"knowledgebase_id": null,
"name": "",
"content": "ragflow123", "content": "ragflow123",
"important_keywords": [], "important_keywords": [],
"document_id": "e6bbba92759511efaa900242ac120004", }'
"status": "1"
}'
``` ```
#### Request parameters
- `"content"`:(*Body parameter*)
Contains the main text or information of the chunk.
- `"important_keywords"`:(*Body parameter*)
list the key terms or phrases that are significant or central to the chunk's content.
- `"available"`:(*Body parameter*)
Indicating the availability status, 0 means unavailable and 1 means available.
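For instance, to only toggle a chunk's availability without touching its content (a minimal sketch reusing the placeholders above):

```bash
curl --request PUT \
     --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id} \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{
     "available": 0
     }'
```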
### Response
Success
```json
{
"code": 0
}
```
Error
```json
{
"code": 102,
"message": "Can't find this chunk 29a2d9987e16ba331fb4d7d30d99b71d2"
}
```
## Insert document chunks ## Insert document chunks
**POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` **POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
@ -966,50 +1000,187 @@ Insert document chunks
### Request ### Request
- Method: POST - Method: POST
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `content`: str
- `important_keywords`:List[str]
#### Request example #### Request example
```shell ```bash
curl --request POST \ curl --request POST \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--raw '{ --data '{
"document_id": "97ad64b6759811ef9fc30242ac120004", "content": "ragflow content"
"content": ["ragflow content", "ragflow content"] }'
}' ```
#### Request parameters
- `content`:(*Body parameter*)
Contains the main text or information of the chunk.
- `important_keywords`:(*Body parameter*)
Lists the key terms or phrases that are significant or central to the chunk's content.
### Response
Success
```json
{
"code": 0,
"data": {
"chunk": {
"content": "ragflow content",
"create_time": "2024-10-16 08:05:04",
"create_timestamp": 1729065904.581025,
"dataset_id": [
"c7ee74067a2c11efb21c0242ac120006"
],
"document_id": "5c5999ec7be811ef9cab0242ac120005",
"id": "d78435d142bd5cf6704da62c778795c5",
"important_keywords": []
}
}
}
``` ```
Error
```json
{
"code": 102,
"message": "`content` is required"
}
```
## Dataset retrieval test ## Dataset retrieval test
**GET** `/api/v1/dataset/{dataset_id}/retrieval` **GET** `/api/v1/retrieval`
Retrieval test of a dataset Retrieval test of a dataset
### Request ### Request
- Method: GET - Method: POST
- URL: `/api/v1/dataset/{dataset_id}/retrieval` - URL: `http://{address}/api/v1/retrieval`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `question`: str
- `datasets`: List[str]
- `documents`: List[str]
- `offset`: int
- `limit`: int
- `similarity_threshold`: float
- `vector_similarity_weight`: float
- `top_k`: int
- `rerank_id`: string
- `keyword`: bool
- `highlight`: bool
#### Request example #### Request example
```shell ```bash
curl --request GET \ curl --request POST \
--url http://{address}/api/v1/dataset/{dataset_id}/retrieval \ --url http://{address}/api/v1/retrieval \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--raw '{ --data '{
"query_text": "This is a cat." "question": "What is advantage of ragflow?",
}' "datasets": [
"b2a62730759d11ef987d0242ac120004"
],
"documents": [
"77df9ef4759a11ef8bdd0242ac120004"
]
}'
``` ```
#### Request parameters
- `"question"`: (*Body parameter*)
The user's question or search keywords
`""`
- `"datasets"`: (*Body parameter*)
The scope of datasets
`None`
- `"documents"`: (*Body parameter*)
The documents to restrict the search to. `None` means no limitation
`None`
- `"offset"`: (*Body parameter*)
The beginning point of retrieved records
`1`
- `"limit"`: (*Body parameter*)
The maximum number of records to return
`30`
- `"similarity_threshold"`: (*Body parameter*)
The minimum similarity score
`0.2`
- `"vector_similarity_weight"`: (*Body parameter*)
The weight of vector cosine similarity, `1 - x` is the term similarity weight
`0.3`
- `"top_k"`: (*Body parameter*)
Number of records engaged in vector cosine computation
`1024`
- `"rerank_id"`: (*Body parameter*)
ID of the rerank model
`None`
- `"keyword"`: (*Body parameter*)
Whether keyword-based matching is enabled
`False`
- `"highlight"`: (*Body parameter*)
Whether to enable highlighting of matched terms in the results
`False`
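A request that also exercises the optional tuning parameters might look like this (a sketch; the values are illustrative, not recommendations):

```bash
curl --request POST \
     --url http://{address}/api/v1/retrieval \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{
     "question": "What is advantage of ragflow?",
     "datasets": ["b2a62730759d11ef987d0242ac120004"],
     "offset": 1,
     "limit": 10,
     "similarity_threshold": 0.2,
     "vector_similarity_weight": 0.3,
     "keyword": true,
     "highlight": true
     }'
```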
### Response
Success
```json
{
"code": 0,
"data": {
"chunks": [
{
"content": "ragflow content",
"content_ltks": "ragflow content",
"document_id": "5c5999ec7be811ef9cab0242ac120005",
"document_keyword": "1.txt",
"highlight": "<em>ragflow</em> content",
"id": "d78435d142bd5cf6704da62c778795c5",
"img_id": "",
"important_keywords": [
""
],
"kb_id": "c7ee74067a2c11efb21c0242ac120006",
"positions": [
""
],
"similarity": 0.9669436601210759,
"term_similarity": 1.0,
"vector_similarity": 0.8898122004035864
}
],
"doc_aggs": [
{
"count": 1,
"doc_id": "5c5999ec7be811ef9cab0242ac120005",
"doc_name": "1.txt"
}
],
"total": 1
}
}
```
Error
```json
{
"code": 102,
"message": "`datasets` is required."
}
```
## Create chat ## Create chat
**POST** `/api/v1/chat` **POST** `/api/v1/chat`
@ -1708,26 +1879,27 @@ Error
## Chat with a chat session ## Chat with a chat session
**POST** `/api/v1/chat/{chat_id}/session/{session_id}/completion` **POST** `/api/v1/chat/{chat_id}/completion`
Chat with a chat session Chat with a chat session
### Request ### Request
- Method: POST - Method: POST
- URL: `http://{address} /api/v1/chat/{chat_id}/session/{session_id}/completion` - URL: `http://{address} /api/v1/chat/{chat_id}/completion`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body: - Body:
- `question`: string - `question`: string
- `stream`: bool - `stream`: bool
- `session_id`: str
#### Request example #### Request example
```bash ```bash
curl --request POST \ curl --request POST \
--url http://{address} /api/v1/chat/{chat_id}/session/{session_id}/completion \ --url http://{address} /api/v1/chat/{chat_id}/completion \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data-binary '{ --data-binary '{
@ -1743,6 +1915,8 @@ curl --request POST \
- `stream`: (*Body Parameter*) - `stream`: (*Body Parameter*)
The approach of streaming text generation. The approach of streaming text generation.
`False` `False`
- `session_id`: (*Body Parameter*)
The ID of the session. If it is not provided, a new session will be generated.
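If `session_id` is omitted, the server creates a new session and returns its ID in the response data, so a first request can simply be (a minimal sketch using the same placeholders):

```bash
curl --request POST \
     --url http://{address}/api/v1/chat/{chat_id}/completion \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data-binary '{
     "question": "Hello!",
     "stream": false
     }'
```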
### Response ### Response
Success Success
```json ```json

View File

@ -244,42 +244,117 @@ File management inside knowledge base
## Upload document ## Upload document
```python ```python
RAGFLOW.upload_document(ds:DataSet, name:str, blob:bytes)-> bool DataSet.upload_documents(document_list: List[dict])
``` ```
### Parameters ### Parameters
#### name #### document_list:`List[dict]`
A list composed of dicts containing `name` and `blob`.
#### blob
### Returns ### Returns
no return
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")
ds.upload_documents([{"name": "1.txt", "blob": "123"}, ...])
```
---
## Update document
```python
Document.update(update_message:dict)
```
### Parameters
#### update_message:`dict`
Only `name`, `parser_config`, and `parser_method` can be changed.
### Returns
no return
### Examples ### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds=rag.list_datasets(id='id')
ds=ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
doc.update([{"parser_method": "manual"...}])
```
--- ---
## Retrieve document ## Download document
```python ```python
RAGFlow.get_document(id:str=None,name:str=None) -> Document Document.download() -> bytes
```
### Returns
bytes of the document.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds=rag.list_datasets(id="id")
ds=ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
open("~/ragflow.txt", "wb+").write(doc.download())
print(doc)
```
---
## List documents
```python
Dataset.list_documents(id:str =None, keywords: str=None, offset: int=0, limit:int = 1024,order_by:str = "create_time", desc: bool = True) -> List[Document]
``` ```
### Parameters ### Parameters
#### id: `str`, *Required* #### id: `str`
ID of the document to retrieve. The ID of the document to retrieve
#### name: `str` #### keywords: `str`
Name or title of the document. List documents whose name has the given keywords. Defaults to `None`.
#### offset: `int`
The beginning number of records for paging. Defaults to `0`.
#### limit: `int`
Records number to return, -1 means all of them. Records number to return, -1 means all of them.
#### orderby: `str`
The field by which the records should be sorted. This specifies the attribute or column used to order the results.
#### desc:`bool`
A boolean flag indicating whether the sorting should be in descending order.
### Returns ### Returns
List[Document]
A document object containing the following attributes: A document object containing the following attributes:
#### id: `str` #### id: `str`
@ -352,98 +427,14 @@ Duration of the processing in seconds or minutes. Defaults to `0.0`.
```python ```python
from ragflow import RAGFlow from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt')
print(doc)
```
---
## Save document settings
```python
Document.save() -> bool
```
### Returns
bool
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d")
doc.parser_method= "manual"
doc.save()
```
---
## Download document
```python
Document.download() -> bytes
```
### Returns
bytes of the document.
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d")
open("~/ragflow.txt", "w+").write(doc.download())
print(doc)
```
---
## List documents
```python
Dataset.list_docs(keywords: str=None, offset: int=0, limit:int = -1) -> List[Document]
```
### Parameters
#### keywords: `str`
List documents whose name has the given keywords. Defaults to `None`.
#### offset: `int`
The beginning number of records for paging. Defaults to `0`.
#### limit: `int`
Records number to return, -1 means all of them. Records number to return, -1 means all of them.
### Returns
List[Document]
### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1") ds = rag.create_dataset(name="kb_1")
filename1 = "~/ragflow.txt" filename1 = "~/ragflow.txt"
rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read()) blob=open(filename1 , "rb").read()
list_files=[{"name":filename1,"blob":blob}]
filename2 = "~/infinity.txt" ds.upload_documents(list_files)
rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read()) for d in ds.list_documents(keywords="rag", offset=0, limit=12):
for d in ds.list_docs(keywords="rag", offset=0, limit=12):
print(d) print(d)
``` ```
@ -452,12 +443,11 @@ for d in ds.list_docs(keywords="rag", offset=0, limit=12):
## Delete documents ## Delete documents
```python ```python
Document.delete() -> bool DataSet.delete_documents(ids: List[str] = None)
``` ```
### Returns ### Returns
bool no return
description: delete success or not
### Examples ### Examples
@ -465,119 +455,87 @@ description: delete success or not
from ragflow import RAGFlow from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1") ds = rag.list_datasets(name="kb_1")
ds = ds[0]
filename1 = "~/ragflow.txt" ds.delete_documents(ids=["id_1","id_2"])
rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read())
filename2 = "~/infinity.txt"
rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read())
for d in ds.list_docs(keywords="rag", offset=0, limit=12):
d.delete()
``` ```
--- ---
## Parse document ## Parse and stop parsing document
```python ```python
Document.async_parse() -> None DataSet.async_parse_documents(document_ids:List[str]) -> None
RAGFLOW.async_parse_documents() -> None DataSet.async_cancel_parse_documents(document_ids:List[str])-> None
``` ```
### Parameters ### Parameters
#### document_ids:`List[str]`
The IDs of the documents to be parsed
???????????????????????????????????????????????????? ????????????????????????????????????????????????????
### Returns ### Returns
no return
???????????????????????????????????????????????????? ????????????????????????????????????????????????????
### Examples ### Examples
```python
#document parse and cancel
rag = RAGFlow(API_KEY, HOST_ADDRESS)
ds = rag.create_dataset(name="dataset_name")
name3 = 'ai.pdf'
path = 'test_data/ai.pdf'
rag.create_document(ds, name=name3, blob=open(path, "rb").read())
doc = rag.get_document(name="ai.pdf")
doc.async_parse()
print("Async parsing initiated")
```
---
## Cancel document parsing
```python
rag.async_cancel_parse_documents(ids)
RAGFLOW.async_cancel_parse_documents()-> None
```
### Parameters
#### ids, `list[]`
### Returns
?????????????????????????????????????????????????
### Examples
```python ```python
#documents parse and cancel #documents parse and cancel
rag = RAGFlow(API_KEY, HOST_ADDRESS) rag = RAGFlow(API_KEY, HOST_ADDRESS)
ds = rag.create_dataset(name="God5") ds = rag.create_dataset(name="God5")
documents = [ documents = [
{'name': 'test1.txt', 'path': 'test_data/test1.txt'}, {'name': 'test1.txt', 'blob': open('./test_data/test1.txt',"rb").read()},
{'name': 'test2.txt', 'path': 'test_data/test2.txt'}, {'name': 'test2.txt', 'blob': open('./test_data/test2.txt',"rb").read()},
{'name': 'test3.txt', 'path': 'test_data/test3.txt'} {'name': 'test3.txt', 'blob': open('./test_data/test3.txt',"rb").read()}
] ]
ds.upload_documents(documents)
# Create documents in bulk documents=ds.list_documents(keywords="test")
for doc_info in documents: ids=[]
with open(doc_info['path'], "rb") as file: for document in documents:
created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read()) ids.append(document.id)
docs = [rag.get_document(name=doc_info['name']) for doc_info in documents] ds.async_parse_documents(ids)
ids = [doc.id for doc in docs]
rag.async_parse_documents(ids)
print("Async bulk parsing initiated") print("Async bulk parsing initiated")
ds.async_cancel_parse_documents(ids)
for doc in docs:
for progress, msg in doc.join(interval=5, timeout=10):
print(f"{doc.name}: Progress: {progress}, Message: {msg}")
cancel_result = rag.async_cancel_parse_documents(ids)
print("Async bulk parsing cancelled") print("Async bulk parsing cancelled")
``` ```
--- ## List chunks
## Join document
??????????????????
```python ```python
Document.join(interval=15, timeout=3600) -> iteral[Tuple[float, str]] Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = -1, id : str = None) -> List[Chunk]
``` ```
### Parameters ### Parameters
#### interval: `int` - `keywords`: `str`
List chunks whose name has the given keywords
default: `None`
Time interval in seconds for progress report. Defaults to `15`. - `offset`: `int`
The beginning number of records for paging
default: `1`
#### timeout: `int` - `limit`: `int`
Records number to return
Timeout in seconds. Defaults to `3600`. default: `30`
- `id`: `str`
The ID of the chunk to be retrieved
default: `None`
### Returns ### Returns
List[Chunk]
iteral[Tuple[float, str]] ### Examples
```python
from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets("123")
ds = ds[0]
ds.async_parse_documents(["wdfxb5t547d"])
for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
print(c)
```
## Add chunk ## Add chunk
```python ```python
@ -587,6 +545,9 @@ Document.add_chunk(content:str) -> Chunk
### Parameters ### Parameters
#### content: `str`, *Required* #### content: `str`, *Required*
Contains the main text or information of the chunk.
#### important_keywords: `List[str]`
Lists the key terms or phrases that are significant or central to the chunk's content.
### Returns ### Returns
@ -598,7 +559,10 @@ chunk
from ragflow import RAGFlow from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d") ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx") chunk = doc.add_chunk(content="xxxxxxx")
``` ```
@ -607,12 +571,15 @@ chunk = doc.add_chunk(content="xxxxxxx")
## Delete chunk ## Delete chunk
```python ```python
Chunk.delete() -> bool Document.delete_chunks(chunk_ids: List[str])
``` ```
### Parameters
#### chunk_ids:`List[str]`
The list of IDs of the chunks to be deleted
### Returns ### Returns
bool no return
### Examples ### Examples
@ -620,22 +587,34 @@ bool
from ragflow import RAGFlow from ragflow import RAGFlow
rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d") ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx") chunk = doc.add_chunk(content="xxxxxxx")
chunk.delete() doc.delete_chunks(["id_1","id_2"])
``` ```
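
`delete_chunks()` pairs naturally with `list_chunks()` when the chunk IDs are not known in advance. A sketch, assuming the `doc` object from the example above; the keyword filter is illustrative:

```python
# Collect the IDs of chunks matching a keyword, then remove them in one call.
stale_ids = [c.id for c in doc.list_chunks(keywords="draft", offset=0, limit=30)]
if stale_ids:
    doc.delete_chunks(stale_ids)
```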
---

## Update chunk

```python
Chunk.update(update_message: dict)
```

### Parameters

- `content`: `str`

  The main text or information of the chunk.

- `important_keywords`: `List[str]`

  The key terms or phrases that are significant or central to the chunk's content.

- `available`: `int`

  The availability status: `0` means unavailable and `1` means available.

### Returns

No return value.

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx")
chunk.update({"content": "sdfx..."})
```
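
Because `available` is part of `update_message`, a chunk can be hidden from retrieval without deleting it. A minimal sketch, assuming the `chunk` object from the example above; the keyword values are placeholders:

```python
# Mark the chunk unavailable (0) and refresh its keywords; 1 would re-enable it.
chunk.update({
    "available": 0,
    "important_keywords": ["legacy", "superseded"],
})
```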
---

## Retrieval

```python
RAGFlow.retrieve(question: str = "", datasets: List[str] = None, documents: List[str] = None, offset: int = 1, limit: int = 30, similarity_threshold: float = 0.2, vector_similarity_weight: float = 0.3, top_k: int = 1024, rerank_id: str = None, keyword: bool = False, highlight: bool = False) -> List[Chunk]
```

### Parameters

#### vector_similarity_weight: `float`

The weight of vector cosine similarity; 1 - x is the weight of term similarity. Defaults to `0.3`.

#### top_k: `int`

Number of records engaged in the vector cosine computation. Defaults to `1024`.

#### rerank_id: `str`

The ID of the rerank model. Defaults to `None`.

#### keyword: `bool`

Indicates whether keyword-based matching is enabled (`True`) or disabled (`False`). Defaults to `False`.

#### highlight: `bool`

Specifies whether to highlight matched terms in the results (`True`) or not (`False`). Defaults to `False`.

### Returns

List[Chunk]

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="ragflow")
ds = ds[0]
name = 'ragflow_test.txt'
path = './test_data/ragflow_test.txt'
rag.create_document(ds, name=name, blob=open(path, "rb").read())
doc = ds.list_documents(name=name)
doc = doc[0]
ds.async_parse_documents([doc.id])
for c in rag.retrieve(question="What's ragflow?",
                      datasets=[ds.id], documents=[doc.id],
                      offset=1, limit=30, similarity_threshold=0.2,
                      vector_similarity_weight=0.3,
                      top_k=1024,
                      ):
    print(c)
```
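
The optional arguments can be combined. A sketch with keyword matching and a rerank model enabled; the rerank model ID here is a placeholder, not a real identifier:

```python
chunks = rag.retrieve(
    question="What's ragflow?",
    datasets=[ds.id],
    similarity_threshold=0.2,
    keyword=True,                  # enable keyword-based matching
    rerank_id="my-rerank-model",   # placeholder rerank model ID
)
for c in chunks:
    print(c.content)
```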
View File
@ -17,32 +17,11 @@ class Chunk(Base):
                res_dict.pop(k)
        super().__init__(rag, res_dict)

    def update(self, update_message: dict):
        res = self.put(f"/dataset/{self.knowledgebase_id}/document/{self.document_id}/chunk/{self.id}",
                       update_message)
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res["message"])
View File
@ -65,3 +65,14 @@ class DataSet(Base):
        if res.get("code") != 0:
            raise Exception(res["message"])

    def async_parse_documents(self, document_ids):
        res = self.post(f"/dataset/{self.id}/chunk", {"document_ids": document_ids})
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res.get("message"))

    def async_cancel_parse_documents(self, document_ids):
        res = self.rm(f"/dataset/{self.id}/chunk", {"document_ids": document_ids})
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res.get("message"))
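
A short client-side sketch of how these two `DataSet` methods are driven, assuming a dataset obtained via `rag.list_datasets()`; the dataset and document IDs are placeholders:

```python
ds = rag.list_datasets(id="123")[0]
doc_ids = ["doc_id_1", "doc_id_2"]        # placeholder document IDs
ds.async_parse_documents(doc_ids)         # POST /dataset/{id}/chunk
ds.async_cancel_parse_documents(doc_ids)  # rm (DELETE) /dataset/{id}/chunk
```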
View File
@ -1,7 +1,10 @@
import time

from .base import Base
from .chunk import Chunk
from typing import List


class Document(Base):

@ -29,160 +32,28 @@ class Document(Base):
                res_dict.pop(k)
        super().__init__(rag, res_dict)
    def list_chunks(self, offset=0, limit=30, keywords="", id: str = None):
        data = {"document_id": self.id, "keywords": keywords, "offset": offset, "limit": limit, "id": id}
        res = self.get(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', data)
        res = res.json()
        if res.get("code") == 0:
            chunks = []
            for chunk_data in res["data"].get("chunks"):
                chunk = Chunk(self.rag, chunk_data)
                chunks.append(chunk)
            return chunks
        raise Exception(res.get("message"))

    def add_chunk(self, content: str):
        res = self.post(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', {"content": content})
        res = res.json()
        if res.get("code") == 0:
            return Chunk(self.rag, res["data"].get("chunk"))
        raise Exception(res.get("message"))

    def delete_chunks(self, ids: List[str]):
        res = self.rm(f"/dataset/{self.knowledgebase_id}/document/{self.id}/chunk", {"ids": ids})
        res = res.json()
        if res.get("code") != 0:
            raise Exception(res.get("message"))
View File
@ -15,8 +15,8 @@ class Session(Base):
        for message in self.messages:
            if "reference" in message:
                message.pop("reference")
        res = self.post(f"/chat/{self.chat_id}/completion",
                        {"question": question, "stream": True, "session_id": self.id}, stream=stream)
        for line in res.iter_lines():
            line = line.decode("utf-8")
            if line.startswith("{"):

@ -82,3 +82,4 @@ class Chunk(Base):
        self.term_similarity = None
        self.positions = None
        super().__init__(rag, res_dict)
View File
@ -158,105 +158,30 @@ class RAGFlow:
            raise Exception(res["message"])

    def retrieve(self, question="", datasets=None, documents=None, offset=1, limit=30,
                 similarity_threshold=0.2, vector_similarity_weight=0.3, top_k=1024,
                 rerank_id: str = None, keyword: bool = False):
        data_params = {
            "offset": offset,
            "limit": limit,
            "similarity_threshold": similarity_threshold,
            "vector_similarity_weight": vector_similarity_weight,
            "top_k": top_k,
            "knowledgebase_id": datasets,
            "rerank_id": rerank_id,
            "keyword": keyword
        }
        data_json = {
            "question": question,
            "datasets": datasets,
            "documents": documents
        }
        # Send a GET request to the retrieval endpoint with the query parameters and JSON body
        res = self.get("/retrieval", data_params, data_json)
        res = res.json()
        if res.get("code") == 0:
            chunks = []
            for chunk_data in res["data"].get("chunks"):
                chunk = Chunk(self, chunk_data)
                chunks.append(chunk)
            return chunks
        raise Exception(res.get("message"))
View File
@ -63,17 +63,13 @@ class TestDocument(TestSdk):
        # Check if the retrieved document is of type Document
        if isinstance(doc, Document):
            # Download the document content and save it to a file
            with open("./ragflow.txt", "wb+") as file:
                file.write(doc.download())
            # Print the document object for debugging
            print(doc)
            # Assert that the download was successful
            assert True, "Document downloaded successfully."
        else:
            # If the document retrieval fails, assert failure
            assert False, f"Failed to get document, error: {doc}"

@ -100,7 +96,7 @@ class TestDocument(TestSdk):
        blob2 = b"Sample document content for ingestion test222."
        list_1 = [{"name": name1, "blob": blob1}, {"name": name2, "blob": blob2}]
        ds.upload_documents(list_1)
        for d in ds.list_documents(keywords="test", offset=0, limit=12):
            assert isinstance(d, Document), "Failed to upload documents"

    def test_delete_documents_in_dataset_with_success(self):

@ -123,16 +119,11 @@ class TestDocument(TestSdk):
        blob1 = b"Sample document content for ingestion test333."
        name2 = "Test Document444.txt"
        blob2 = b"Sample document content for ingestion test444."
        ds.upload_documents([{"name": name1, "blob": blob1}, {"name": name2, "blob": blob2}])
        for d in ds.list_documents(keywords="document", offset=0, limit=12):
            assert isinstance(d, Document)
            ds.delete_documents([d.id])
        remaining_docs = ds.list_documents(keywords="rag", offset=0, limit=12)
        assert len(remaining_docs) == 0, "Documents were not properly deleted."

    def test_parse_and_cancel_document(self):

@ -144,16 +135,15 @@ class TestDocument(TestSdk):
        # Define the document name and path
        name3 = 'westworld.pdf'
        path = './test_data/westworld.pdf'
        # Create a document in the dataset using the file path
        ds.upload_documents([{"name": name3, "blob": open(path, "rb").read()}])
        # Retrieve the uploaded document by name
        doc = ds.list_documents(name="westworld.pdf")
        doc = doc[0]
        # Initiate asynchronous parsing for the retrieved document
        ds.async_parse_documents(document_ids=[doc.id])
        # Print message to confirm asynchronous parsing has been initiated
        print("Async parsing initiated")