Fix: fix document concurrent upload issue (#6095)

### What problem does this PR solve?

Resolve document concurrent upload issue. #6039 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei 2025-03-14 16:31:44 +08:00 committed by GitHub
parent 9d94acbedb
commit d7774cf049
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 68 additions and 59 deletions

View File

@ -13,33 +13,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import xxhash
import json
import logging
import random
import re
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from datetime import datetime
from io import BytesIO
import trio
import trio
import xxhash
from peewee import fn
from api.db.db_utils import bulk_insert_into_db
from api import settings
from api.utils import current_timestamp, get_format_time, get_uuid
from rag.settings import SVR_QUEUE_NAME
from rag.utils.storage_factory import STORAGE_IMPL
from rag.nlp import search, rag_tokenizer
from api.db import FileType, TaskStatus, ParserType, LLMType
from api.db.db_models import DB, Knowledgebase, Tenant, Task, UserTenant
from api.db.db_models import Document
from api.db import FileType, LLMType, ParserType, StatusEnum, TaskStatus
from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant
from api.db.db_utils import bulk_insert_into_db
from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db import StatusEnum
from api.utils import current_timestamp, get_format_time, get_uuid
from rag.nlp import rag_tokenizer, search
from rag.settings import SVR_QUEUE_NAME
from rag.utils.redis_conn import REDIS_CONN
from rag.utils.storage_factory import STORAGE_IMPL
class DocumentService(CommonService):
@ -96,9 +93,7 @@ class DocumentService(CommonService):
def insert(cls, doc):
if not cls.save(**doc):
raise RuntimeError("Database error (Document)!")
e, kb = KnowledgebaseService.get_by_id(doc["kb_id"])
if not KnowledgebaseService.update_by_id(
kb.id, {"doc_num": kb.doc_num + 1}):
if not KnowledgebaseService.atomic_increase_doc_num_by_id(doc["kb_id"]):
raise RuntimeError("Database error (Knowledgebase)!")
return Document(**doc)
@ -174,9 +169,9 @@ class DocumentService(CommonService):
"Document not found which is supposed to be there")
num = Knowledgebase.update(
token_num=Knowledgebase.token_num +
token_num,
token_num,
chunk_num=Knowledgebase.chunk_num +
chunk_num).where(
chunk_num).where(
Knowledgebase.id == kb_id).execute()
return num
@ -192,9 +187,9 @@ class DocumentService(CommonService):
"Document not found which is supposed to be there")
num = Knowledgebase.update(
token_num=Knowledgebase.token_num -
token_num,
token_num,
chunk_num=Knowledgebase.chunk_num -
chunk_num
chunk_num
).where(
Knowledgebase.id == kb_id).execute()
return num
@ -207,9 +202,9 @@ class DocumentService(CommonService):
num = Knowledgebase.update(
token_num=Knowledgebase.token_num -
doc.token_num,
doc.token_num,
chunk_num=Knowledgebase.chunk_num -
doc.chunk_num,
doc.chunk_num,
doc_num=Knowledgebase.doc_num - 1
).where(
Knowledgebase.id == doc.kb_id).execute()
@ -221,7 +216,7 @@ class DocumentService(CommonService):
docs = cls.model.select(
Knowledgebase.tenant_id).join(
Knowledgebase, on=(
Knowledgebase.id == cls.model.kb_id)).where(
Knowledgebase.id == cls.model.kb_id)).where(
cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value)
docs = docs.dicts()
if not docs:
@ -243,7 +238,7 @@ class DocumentService(CommonService):
docs = cls.model.select(
Knowledgebase.tenant_id).join(
Knowledgebase, on=(
Knowledgebase.id == cls.model.kb_id)).where(
Knowledgebase.id == cls.model.kb_id)).where(
cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value)
docs = docs.dicts()
if not docs:
@ -256,7 +251,7 @@ class DocumentService(CommonService):
docs = cls.model.select(
cls.model.id).join(
Knowledgebase, on=(
Knowledgebase.id == cls.model.kb_id)
Knowledgebase.id == cls.model.kb_id)
).join(UserTenant, on=(UserTenant.tenant_id == Knowledgebase.tenant_id)
).where(cls.model.id == doc_id, UserTenant.user_id == user_id).paginate(0, 1)
docs = docs.dicts()
@ -270,7 +265,7 @@ class DocumentService(CommonService):
docs = cls.model.select(
cls.model.id).join(
Knowledgebase, on=(
Knowledgebase.id == cls.model.kb_id)
Knowledgebase.id == cls.model.kb_id)
).where(cls.model.id == doc_id, Knowledgebase.created_by == user_id).paginate(0, 1)
docs = docs.dicts()
if not docs:
@ -283,7 +278,7 @@ class DocumentService(CommonService):
docs = cls.model.select(
Knowledgebase.embd_id).join(
Knowledgebase, on=(
Knowledgebase.id == cls.model.kb_id)).where(
Knowledgebase.id == cls.model.kb_id)).where(
cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value)
docs = docs.dicts()
if not docs:
@ -306,9 +301,9 @@ class DocumentService(CommonService):
Tenant.asr_id,
Tenant.llm_id,
)
.join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(cls.model.id == doc_id)
.join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(cls.model.id == doc_id)
)
configs = configs.dicts()
if not configs:
@ -374,6 +369,7 @@ class DocumentService(CommonService):
"progress_msg": "Task is queued...",
"process_begin_at": get_format_time()
})
@classmethod
@DB.connection_context()
def update_meta_fields(cls, doc_id, meta_fields):
@ -425,7 +421,7 @@ class DocumentService(CommonService):
info = {
"process_duation": datetime.timestamp(
datetime.now()) -
d["process_begin_at"].timestamp(),
d["process_begin_at"].timestamp(),
"run": status}
if prg != 0:
info["progress"] = prg
@ -480,13 +476,13 @@ def queue_raptor_o_graphrag_tasks(doc, ty):
def doc_upload_and_parse(conversation_id, file_objs, user_id):
from rag.app import presentation, picture, naive, audio, email
from api.db.services.api_service import API4ConversationService
from api.db.services.conversation_service import ConversationService
from api.db.services.dialog_service import DialogService
from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import TenantService
from api.db.services.api_service import API4ConversationService
from api.db.services.conversation_service import ConversationService
from rag.app import audio, email, naive, picture, presentation
e, conv = ConversationService.get_by_id(conversation_id)
if not e:

View File

@ -13,11 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from api.db import StatusEnum, TenantPermission
from api.db.db_models import Knowledgebase, DB, Tenant, User, UserTenant,Document
from api.db.services.common_service import CommonService
from datetime import datetime
from peewee import fn
from api.db import StatusEnum, TenantPermission
from api.db.db_models import DB, Document, Knowledgebase, Tenant, User, UserTenant
from api.db.services.common_service import CommonService
from api.utils import current_timestamp, datetime_format
class KnowledgebaseService(CommonService):
"""Service class for managing knowledge base operations.
@ -108,16 +112,16 @@ class KnowledgebaseService(CommonService):
@classmethod
@DB.connection_context()
def list_documents_by_ids(cls,kb_ids):
def list_documents_by_ids(cls, kb_ids):
# Get document IDs associated with given knowledge base IDs
# Args:
# kb_ids: List of knowledge base IDs
# Returns:
# List of document IDs
doc_ids=cls.model.select(Document.id.alias("document_id")).join(Document,on=(cls.model.id == Document.kb_id)).where(
doc_ids = cls.model.select(Document.id.alias("document_id")).join(Document, on=(cls.model.id == Document.kb_id)).where(
cls.model.id.in_(kb_ids)
)
doc_ids =list(doc_ids.dicts())
doc_ids = list(doc_ids.dicts())
doc_ids = [doc["document_id"] for doc in doc_ids]
return doc_ids
@ -222,7 +226,7 @@ class KnowledgebaseService(CommonService):
cls.model.parser_config,
cls.model.pagerank]
kbs = cls.model.select(*fields).join(Tenant, on=(
(Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
(Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
(cls.model.id == kb_id),
(cls.model.status == StatusEnum.VALID.value)
)
@ -324,7 +328,7 @@ class KnowledgebaseService(CommonService):
kbs = kbs.where(
((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
TenantPermission.TEAM.value)) | (
cls.model.tenant_id == user_id))
cls.model.tenant_id == user_id))
& (cls.model.status == StatusEnum.VALID.value)
)
if desc:
@ -347,7 +351,7 @@ class KnowledgebaseService(CommonService):
# Boolean indicating accessibility
docs = cls.model.select(
cls.model.id).join(UserTenant, on=(UserTenant.tenant_id == Knowledgebase.tenant_id)
).where(cls.model.id == kb_id, UserTenant.user_id == user_id).paginate(0, 1)
).where(cls.model.id == kb_id, UserTenant.user_id == user_id).paginate(0, 1)
docs = docs.dicts()
if not docs:
return False
@ -363,7 +367,7 @@ class KnowledgebaseService(CommonService):
# Returns:
# List containing knowledge base information
kbs = cls.model.select().join(UserTenant, on=(UserTenant.tenant_id == Knowledgebase.tenant_id)
).where(cls.model.id == kb_id, UserTenant.user_id == user_id).paginate(0, 1)
).where(cls.model.id == kb_id, UserTenant.user_id == user_id).paginate(0, 1)
kbs = kbs.dicts()
return list(kbs)
@ -377,7 +381,16 @@ class KnowledgebaseService(CommonService):
# Returns:
# List containing knowledge base information
kbs = cls.model.select().join(UserTenant, on=(UserTenant.tenant_id == Knowledgebase.tenant_id)
).where(cls.model.name == kb_name, UserTenant.user_id == user_id).paginate(0, 1)
).where(cls.model.name == kb_name, UserTenant.user_id == user_id).paginate(0, 1)
kbs = kbs.dicts()
return list(kbs)
@classmethod
@DB.connection_context()
def atomic_increase_doc_num_by_id(cls, kb_id):
data = {}
data["update_time"] = current_timestamp()
data["update_date"] = datetime_format(datetime.now())
data["doc_num"] = cls.model.doc_num + 1
num = cls.model.update(data).where(cls.model.id == kb_id).execute()
return num