# Replaced md5 with xxhash64 for chunk id (#4009)

### What problem does this PR solve?

Replaces the cryptographic MD5 hash with the much faster, non-cryptographic xxHash64 when deriving chunk IDs from the chunk content and the document ID.
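For context, a minimal before/after sketch of the ID derivation (the `content` and `doc_id` values are hypothetical stand-ins for the request fields used in the diffs below):

```python
import hashlib

import xxhash  # pip install xxhash

# Hypothetical stand-ins for the request fields used in the diffs below.
content = "Deep learning is a subset of machine learning."
doc_id = "b1f0de2e9e4e11ef"
payload = (content + doc_id).encode("utf-8")

# Old scheme: cryptographic MD5, 128-bit digest -> 32 hex characters.
old_id = hashlib.md5(payload).hexdigest()

# New scheme: non-cryptographic xxHash64, 64-bit digest -> 16 hex characters.
new_id = xxhash.xxh64(payload).hexdigest()

print(len(old_id), len(new_id))  # 32 16
```

Both schemes are deterministic, so re-adding identical content to the same document still maps to the same chunk ID. Note that the switch halves the ID width from 128 to 64 bits, trading a higher (but still tiny) collision probability for a substantial hashing speedup.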

### Type of change

- [x] Refactoring
Authored by Zhichang Yu on 2024-12-12 17:47:39 +08:00, committed by GitHub.
Commit c8b1a564aa (parent 301f95837c). 5 changed files with 17 additions and 34 deletions.


```diff
@@ -31,7 +31,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, va
 from api.db.services.document_service import DocumentService
 from api import settings
 from api.utils.api_utils import get_json_result
-import hashlib
+import xxhash
 import re
@@ -208,9 +208,7 @@ def rm():
 @validate_request("doc_id", "content_with_weight")
 def create():
     req = request.json
-    md5 = hashlib.md5()
-    md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
-    chunck_id = md5.hexdigest()
+    chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
     d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
          "content_with_weight": req["content_with_weight"]}
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
```


```diff
@@ -22,7 +22,7 @@ from rag.nlp import rag_tokenizer
 from api.db import LLMType, ParserType
 from api.db.services.llm_service import TenantLLMService
 from api import settings
-import hashlib
+import xxhash
 import re
 from api.utils.api_utils import token_required
 from api.db.db_models import Task
@@ -984,10 +984,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         return get_error_data_result(
             "`questions` is required to be a list"
         )
-    md5 = hashlib.md5()
-    md5.update((req["content"] + document_id).encode("utf-8"))
-    chunk_id = md5.hexdigest()
+    chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
     d = {
         "id": chunk_id,
         "content_ltks": rag_tokenizer.tokenize(req["content"]),
```


```diff
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 import logging
-import hashlib
+import xxhash
 import json
 import random
 import re
@@ -508,10 +508,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
         for ck in th.result():
             d = deepcopy(doc)
             d.update(ck)
-            md5 = hashlib.md5()
-            md5.update((ck["content_with_weight"] +
-                        str(d["doc_id"])).encode("utf-8"))
-            d["id"] = md5.hexdigest()
+            d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
             d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
             d["create_timestamp_flt"] = datetime.now().timestamp()
             if not d.get("image"):
```


```diff
@@ -35,17 +35,13 @@ from api import settings
 from rag.nlp import search
 
 def trim_header_by_lines(text: str, max_length) -> str:
-    if len(text) <= max_length:
-        return text
-    lines = text.split("\n")
-    total = 0
-    idx = len(lines) - 1
-    for i in range(len(lines)-1, -1, -1):
-        if total + len(lines[i]) > max_length:
-            break
-        idx = i
-    text2 = "\n".join(lines[idx:])
-    return text2
+    len_text = len(text)
+    if len_text <= max_length:
+        return text
+    for i in range(len_text):
+        if text[i] == '\n' and len_text - i <= max_length:
+            return text[i+1:]
+    return text
 
 class TaskService(CommonService):
     model = Task
```
```diff
@@ -183,7 +179,7 @@ class TaskService(CommonService):
         if os.environ.get("MACOS"):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
@@ -194,7 +190,7 @@ class TaskService(CommonService):
         with DB.lock("update_progress", -1):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
```


```diff
@@ -27,7 +27,7 @@ import logging
 import os
 from datetime import datetime
 import json
-import hashlib
+import xxhash
 import copy
 import re
 import time
@@ -226,10 +226,7 @@ def build_chunks(task, progress_callback):
     for ck in cks:
         d = copy.deepcopy(doc)
         d.update(ck)
-        md5 = hashlib.md5()
-        md5.update((ck["content_with_weight"] +
-                    str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         if not d.get("image"):
@@ -368,9 +365,7 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
     tk_count = 0
     for content, vctr in chunks[original_length:]:
         d = copy.deepcopy(doc)
-        md5 = hashlib.md5()
-        md5.update((content + str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         d[vctr_nm] = vctr.tolist()
```
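For anyone touching other call sites: python-xxhash mirrors hashlib's streaming interface, so the old two-step `update()` pattern would also have worked unchanged; a minimal sketch with hypothetical values:

```python
import xxhash

# Streaming style, mirroring the replaced hashlib.md5() pattern.
h = xxhash.xxh64()
h.update("chunk text".encode("utf-8"))
h.update("doc_42".encode("utf-8"))

# One-shot style, as used throughout this commit; both hash the
# same concatenated bytes with the same default seed.
assert h.hexdigest() == xxhash.xxh64(b"chunk textdoc_42").hexdigest()
```

The one-shot form was presumably preferred because it collapses three lines into one at every call site.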