Mirror of https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git (synced 2025-08-14 04:26:05 +08:00)
# Replaced md5 with xxhash64 for chunk id (#4009)
### What problem does this PR solve?

Replaced md5 with xxhash64 for chunk id.

### Type of change

- [x] Refactoring
Parent: 301f95837c
Commit: c8b1a564aa
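At its core, the patch swaps the hash used to derive deterministic chunk ids: both the old and the new scheme hash the chunk content concatenated with the document id, so identical input still yields an identical id. A minimal sketch of the two derivations (the helper names are illustrative, not from the repository):

```python
import hashlib

import xxhash


def chunk_id_md5(content: str, doc_id: str) -> str:
    # Old scheme: 128-bit MD5, 32 hex characters.
    md5 = hashlib.md5()
    md5.update((content + doc_id).encode("utf-8"))
    return md5.hexdigest()


def chunk_id_xxh64(content: str, doc_id: str) -> str:
    # New scheme: 64-bit xxHash64, 16 hex characters, much faster to
    # compute and sufficient for a non-adversarial, deterministic id.
    return xxhash.xxh64((content + doc_id).encode("utf-8")).hexdigest()
```

Every hunk below is an instance of this one-line substitution, except for the `trim_header_by_lines` rewrite and the progress-log limit change in the task service.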
```diff
@@ -31,7 +31,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, va
 from api.db.services.document_service import DocumentService
 from api import settings
 from api.utils.api_utils import get_json_result
-import hashlib
+import xxhash
 import re
 
 
```
```diff
@@ -208,9 +208,7 @@ def rm():
 @validate_request("doc_id", "content_with_weight")
 def create():
     req = request.json
-    md5 = hashlib.md5()
-    md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
-    chunck_id = md5.hexdigest()
+    chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
     d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
          "content_with_weight": req["content_with_weight"]}
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
```
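A quick illustrative check of the determinism this endpoint relies on (the sample strings are made up):

```python
import xxhash

a = xxhash.xxh64(("some chunk text" + "doc-123").encode("utf-8")).hexdigest()
b = xxhash.xxh64(("some chunk text" + "doc-123").encode("utf-8")).hexdigest()
assert a == b  # same content_with_weight + doc_id -> same chunk id
print(a)       # a 16-hex-character digest
```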
```diff
@@ -22,7 +22,7 @@ from rag.nlp import rag_tokenizer
 from api.db import LLMType, ParserType
 from api.db.services.llm_service import TenantLLMService
 from api import settings
-import hashlib
+import xxhash
 import re
 from api.utils.api_utils import token_required
 from api.db.db_models import Task
```
```diff
@@ -984,10 +984,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         return get_error_data_result(
             "`questions` is required to be a list"
         )
-    md5 = hashlib.md5()
-    md5.update((req["content"] + document_id).encode("utf-8"))
-
-    chunk_id = md5.hexdigest()
+    chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
     d = {
         "id": chunk_id,
         "content_ltks": rag_tokenizer.tokenize(req["content"]),
```
```diff
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 import logging
-import hashlib
+import xxhash
 import json
 import random
 import re
```
```diff
@@ -508,10 +508,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
     for ck in th.result():
         d = deepcopy(doc)
         d.update(ck)
-        md5 = hashlib.md5()
-        md5.update((ck["content_with_weight"] +
-                    str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         if not d.get("image"):
```
```diff
@@ -35,17 +35,13 @@ from api import settings
 from rag.nlp import search
 
 def trim_header_by_lines(text: str, max_length) -> str:
-    if len(text) <= max_length:
-        return text
-    lines = text.split("\n")
-    total = 0
-    idx = len(lines) - 1
-    for i in range(len(lines)-1, -1, -1):
-        if total + len(lines[i]) > max_length:
-            break
-        idx = i
-    text2 = "\n".join(lines[idx:])
-    return text2
+    len_text = len(text)
+    if len_text <= max_length:
+        return text
+    for i in range(len_text):
+        if text[i] == '\n' and len_text - i <= max_length:
+            return text[i+1:]
+    return text
 
 class TaskService(CommonService):
     model = Task
```
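To make the rewritten helper's behavior concrete, here it is as it reads after the patch, with a small illustrative driver (the sample strings are made up). It drops whole leading lines until the remaining tail fits within `max_length`, and returns the text unchanged when no newline-bounded tail fits:

```python
def trim_header_by_lines(text: str, max_length) -> str:
    # Keep the tail of the text, discarding whole lines from the head.
    len_text = len(text)
    if len_text <= max_length:
        return text
    for i in range(len_text):
        # len_text - i measures the tail starting at the newline itself,
        # so the returned suffix text[i+1:] is strictly shorter than max_length.
        if text[i] == '\n' and len_text - i <= max_length:
            return text[i+1:]
    return text  # no newline qualifies (e.g. one long line): keep everything


text = "line1\nline2\nline3"            # 17 characters
print(trim_header_by_lines(text, 99))   # unchanged: already fits
print(trim_header_by_lines(text, 12))   # "line2\nline3"
print(trim_header_by_lines(text, 6))    # "line3"
print(trim_header_by_lines(text, 5))    # whole text: no tail fits on a line boundary
```

It appears from the removed code that the old version initialized `total = 0` but never accumulated into it, so its length accounting was broken; the rewrite measures the true tail length directly and avoids the split/join round-trip.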
```diff
@@ -183,7 +179,7 @@ class TaskService(CommonService):
         if os.environ.get("MACOS"):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
```
```diff
@@ -194,7 +190,7 @@ class TaskService(CommonService):
         with DB.lock("update_progress", -1):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
```
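These two hunks are independent of the hash swap: they reduce the retained task progress log from 10000 to 1000 characters. On each update the incoming message is appended and `trim_header_by_lines` then drops whole leading lines, so `progress_msg` stays bounded to roughly the most recent 1000 characters of output.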
```diff
@@ -27,7 +27,7 @@ import logging
 import os
 from datetime import datetime
 import json
-import hashlib
+import xxhash
 import copy
 import re
 import time
```
```diff
@@ -226,10 +226,7 @@ def build_chunks(task, progress_callback):
     for ck in cks:
         d = copy.deepcopy(doc)
         d.update(ck)
-        md5 = hashlib.md5()
-        md5.update((ck["content_with_weight"] +
-                    str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         if not d.get("image"):
```
```diff
@@ -368,9 +365,7 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
     tk_count = 0
     for content, vctr in chunks[original_length:]:
         d = copy.deepcopy(doc)
-        md5 = hashlib.md5()
-        md5.update((content + str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         d[vctr_nm] = vctr.tolist()
```
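One trade-off of the swap is digest width: xxh64 is a 64-bit non-cryptographic hash, so its `hexdigest()` is 16 hex characters against MD5's 32, with correspondingly higher collision probability across chunks; in exchange, hashing is substantially faster. A quick sketch of the difference (the payload is illustrative):

```python
import hashlib

import xxhash

data = b"chunk content + doc id"            # illustrative payload
print(len(hashlib.md5(data).hexdigest()))   # 32 hex chars (128-bit digest)
print(len(xxhash.xxh64(data).hexdigest()))  # 16 hex chars (64-bit digest)
```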