mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-08-12 08:28:57 +08:00)
add document lock for multi-thread (#9873)
This commit is contained in:
parent 9633c5dab6
commit af68084895
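This change serializes document creation per dataset. Previously, `position = DocumentService.get_documents_position(dataset.id)` and the `position += 1` / `db.session.flush()` sequence that follows it ran unguarded, so two requests adding documents to the same dataset at the same time could read the same starting position and create documents with colliding positions. The commit wraps the entire creation block (the upload_file, notion_import, and website_crawl branches alike) in a Redis lock keyed by the dataset ID; aside from the two new lock lines, every other changed line in the hunk is the re-indentation needed to move the block under the `with` statement. A standalone sketch of the locking pattern follows the diff.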
@@ -760,166 +760,168 @@ class DocumentService:
             )
             db.session.add(dataset_process_rule)
             db.session.commit()
-        position = DocumentService.get_documents_position(dataset.id)
-        document_ids = []
-        duplicate_document_ids = []
-        if document_data["data_source"]["type"] == "upload_file":
-            upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
-            for file_id in upload_file_list:
-                file = (
-                    db.session.query(UploadFile)
-                    .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
-                    .first()
-                )
-
-                # raise error if file not found
-                if not file:
-                    raise FileNotExistsError()
-
-                file_name = file.name
-                data_source_info = {
-                    "upload_file_id": file_id,
-                }
-                # check duplicate
-                if document_data.get("duplicate", False):
-                    document = Document.query.filter_by(
-                        dataset_id=dataset.id,
-                        tenant_id=current_user.current_tenant_id,
-                        data_source_type="upload_file",
-                        enabled=True,
-                        name=file_name,
-                    ).first()
-                    if document:
-                        document.dataset_process_rule_id = dataset_process_rule.id
-                        document.updated_at = datetime.datetime.utcnow()
-                        document.created_from = created_from
-                        document.doc_form = document_data["doc_form"]
-                        document.doc_language = document_data["doc_language"]
-                        document.data_source_info = json.dumps(data_source_info)
-                        document.batch = batch
-                        document.indexing_status = "waiting"
-                        db.session.add(document)
-                        documents.append(document)
-                        duplicate_document_ids.append(document.id)
-                        continue
-                document = DocumentService.build_document(
-                    dataset,
-                    dataset_process_rule.id,
-                    document_data["data_source"]["type"],
-                    document_data["doc_form"],
-                    document_data["doc_language"],
-                    data_source_info,
-                    created_from,
-                    position,
-                    account,
-                    file_name,
-                    batch,
-                )
-                db.session.add(document)
-                db.session.flush()
-                document_ids.append(document.id)
-                documents.append(document)
-                position += 1
-        elif document_data["data_source"]["type"] == "notion_import":
-            notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
-            exist_page_ids = []
-            exist_document = {}
-            documents = Document.query.filter_by(
-                dataset_id=dataset.id,
-                tenant_id=current_user.current_tenant_id,
-                data_source_type="notion_import",
-                enabled=True,
-            ).all()
-            if documents:
-                for document in documents:
-                    data_source_info = json.loads(document.data_source_info)
-                    exist_page_ids.append(data_source_info["notion_page_id"])
-                    exist_document[data_source_info["notion_page_id"]] = document.id
-            for notion_info in notion_info_list:
-                workspace_id = notion_info["workspace_id"]
-                data_source_binding = DataSourceOauthBinding.query.filter(
-                    db.and_(
-                        DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
-                        DataSourceOauthBinding.provider == "notion",
-                        DataSourceOauthBinding.disabled == False,
-                        DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
-                    )
-                ).first()
-                if not data_source_binding:
-                    raise ValueError("Data source binding not found.")
-                for page in notion_info["pages"]:
-                    if page["page_id"] not in exist_page_ids:
-                        data_source_info = {
-                            "notion_workspace_id": workspace_id,
-                            "notion_page_id": page["page_id"],
-                            "notion_page_icon": page["page_icon"],
-                            "type": page["type"],
-                        }
-                        document = DocumentService.build_document(
-                            dataset,
-                            dataset_process_rule.id,
-                            document_data["data_source"]["type"],
-                            document_data["doc_form"],
-                            document_data["doc_language"],
-                            data_source_info,
-                            created_from,
-                            position,
-                            account,
-                            page["page_name"],
-                            batch,
-                        )
-                        db.session.add(document)
-                        db.session.flush()
-                        document_ids.append(document.id)
-                        documents.append(document)
-                        position += 1
-                    else:
-                        exist_document.pop(page["page_id"])
-            # delete not selected documents
-            if len(exist_document) > 0:
-                clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
-        elif document_data["data_source"]["type"] == "website_crawl":
-            website_info = document_data["data_source"]["info_list"]["website_info_list"]
-            urls = website_info["urls"]
-            for url in urls:
-                data_source_info = {
-                    "url": url,
-                    "provider": website_info["provider"],
-                    "job_id": website_info["job_id"],
-                    "only_main_content": website_info.get("only_main_content", False),
-                    "mode": "crawl",
-                }
-                if len(url) > 255:
-                    document_name = url[:200] + "..."
-                else:
-                    document_name = url
-                document = DocumentService.build_document(
-                    dataset,
-                    dataset_process_rule.id,
-                    document_data["data_source"]["type"],
-                    document_data["doc_form"],
-                    document_data["doc_language"],
-                    data_source_info,
-                    created_from,
-                    position,
-                    account,
-                    document_name,
-                    batch,
-                )
-                db.session.add(document)
-                db.session.flush()
-                document_ids.append(document.id)
-                documents.append(document)
-                position += 1
-        db.session.commit()
+        lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
+        with redis_client.lock(lock_name, timeout=600):
+            position = DocumentService.get_documents_position(dataset.id)
+            document_ids = []
+            duplicate_document_ids = []
+            if document_data["data_source"]["type"] == "upload_file":
+                upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
+                for file_id in upload_file_list:
+                    file = (
+                        db.session.query(UploadFile)
+                        .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
+                        .first()
+                    )
+
+                    # raise error if file not found
+                    if not file:
+                        raise FileNotExistsError()
+
+                    file_name = file.name
+                    data_source_info = {
+                        "upload_file_id": file_id,
+                    }
+                    # check duplicate
+                    if document_data.get("duplicate", False):
+                        document = Document.query.filter_by(
+                            dataset_id=dataset.id,
+                            tenant_id=current_user.current_tenant_id,
+                            data_source_type="upload_file",
+                            enabled=True,
+                            name=file_name,
+                        ).first()
+                        if document:
+                            document.dataset_process_rule_id = dataset_process_rule.id
+                            document.updated_at = datetime.datetime.utcnow()
+                            document.created_from = created_from
+                            document.doc_form = document_data["doc_form"]
+                            document.doc_language = document_data["doc_language"]
+                            document.data_source_info = json.dumps(data_source_info)
+                            document.batch = batch
+                            document.indexing_status = "waiting"
+                            db.session.add(document)
+                            documents.append(document)
+                            duplicate_document_ids.append(document.id)
+                            continue
+                    document = DocumentService.build_document(
+                        dataset,
+                        dataset_process_rule.id,
+                        document_data["data_source"]["type"],
+                        document_data["doc_form"],
+                        document_data["doc_language"],
+                        data_source_info,
+                        created_from,
+                        position,
+                        account,
+                        file_name,
+                        batch,
+                    )
+                    db.session.add(document)
+                    db.session.flush()
+                    document_ids.append(document.id)
+                    documents.append(document)
+                    position += 1
+            elif document_data["data_source"]["type"] == "notion_import":
+                notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
+                exist_page_ids = []
+                exist_document = {}
+                documents = Document.query.filter_by(
+                    dataset_id=dataset.id,
+                    tenant_id=current_user.current_tenant_id,
+                    data_source_type="notion_import",
+                    enabled=True,
+                ).all()
+                if documents:
+                    for document in documents:
+                        data_source_info = json.loads(document.data_source_info)
+                        exist_page_ids.append(data_source_info["notion_page_id"])
+                        exist_document[data_source_info["notion_page_id"]] = document.id
+                for notion_info in notion_info_list:
+                    workspace_id = notion_info["workspace_id"]
+                    data_source_binding = DataSourceOauthBinding.query.filter(
+                        db.and_(
+                            DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
+                            DataSourceOauthBinding.provider == "notion",
+                            DataSourceOauthBinding.disabled == False,
+                            DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
+                        )
+                    ).first()
+                    if not data_source_binding:
+                        raise ValueError("Data source binding not found.")
+                    for page in notion_info["pages"]:
+                        if page["page_id"] not in exist_page_ids:
+                            data_source_info = {
+                                "notion_workspace_id": workspace_id,
+                                "notion_page_id": page["page_id"],
+                                "notion_page_icon": page["page_icon"],
+                                "type": page["type"],
+                            }
+                            document = DocumentService.build_document(
+                                dataset,
+                                dataset_process_rule.id,
+                                document_data["data_source"]["type"],
+                                document_data["doc_form"],
+                                document_data["doc_language"],
+                                data_source_info,
+                                created_from,
+                                position,
+                                account,
+                                page["page_name"],
+                                batch,
+                            )
+                            db.session.add(document)
+                            db.session.flush()
+                            document_ids.append(document.id)
+                            documents.append(document)
+                            position += 1
+                        else:
+                            exist_document.pop(page["page_id"])
+                # delete not selected documents
+                if len(exist_document) > 0:
+                    clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
+            elif document_data["data_source"]["type"] == "website_crawl":
+                website_info = document_data["data_source"]["info_list"]["website_info_list"]
+                urls = website_info["urls"]
+                for url in urls:
+                    data_source_info = {
+                        "url": url,
+                        "provider": website_info["provider"],
+                        "job_id": website_info["job_id"],
+                        "only_main_content": website_info.get("only_main_content", False),
+                        "mode": "crawl",
+                    }
+                    if len(url) > 255:
+                        document_name = url[:200] + "..."
+                    else:
+                        document_name = url
+                    document = DocumentService.build_document(
+                        dataset,
+                        dataset_process_rule.id,
+                        document_data["data_source"]["type"],
+                        document_data["doc_form"],
+                        document_data["doc_language"],
+                        data_source_info,
+                        created_from,
+                        position,
+                        account,
+                        document_name,
+                        batch,
+                    )
+                    db.session.add(document)
+                    db.session.flush()
+                    document_ids.append(document.id)
+                    documents.append(document)
+                    position += 1
+            db.session.commit()
 
         # trigger async task
         if document_ids:
             document_indexing_task.delay(dataset.id, document_ids)
         if duplicate_document_ids:
             duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
 
         return documents, batch
 
     @staticmethod
     def check_documents_upload_quota(count: int, features: FeatureModel):
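Pulled out of the surrounding ORM code, the pattern the commit applies is a dataset-scoped read-modify-write guarded by a Redis lock. The sketch below is illustrative, not Dify code: it assumes a redis-py client reachable at localhost:6379, and positions / next_position are hypothetical stand-ins for the Document rows and DocumentService.get_documents_position; only the lock-name scheme and the timeout=600 value are taken from the diff.

# Minimal sketch of the locking pattern (assumes a local Redis server).
import threading

import redis

r = redis.Redis(host="localhost", port=6379)

positions: list[int] = []  # stand-in for the Document.position column


def next_position() -> int:
    # Read-compute step: highest assigned position plus one,
    # analogous to DocumentService.get_documents_position.
    return (max(positions) + 1) if positions else 1


def add_documents(dataset_id: str, count: int) -> None:
    # Same key scheme as the commit: one lock per dataset.
    lock_name = "add_document_lock_dataset_id_{}".format(dataset_id)
    # timeout=600 matches the commit: the lock auto-expires after
    # 10 minutes, so a crashed worker cannot hold it forever.
    with r.lock(lock_name, timeout=600):
        position = next_position()
        for _ in range(count):
            positions.append(position)  # write step (the db.session.flush() analogue)
            position += 1


threads = [threading.Thread(target=add_documents, args=("ds-1", 3)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# With the lock, the positions are the unique sequence 1..12; without it,
# two threads could read the same starting position and assign duplicates.
print(sorted(positions))
assert sorted(positions) == list(range(1, 13))

Two design points worth noting. The timeout matters because redis-py's lock auto-expires after that many seconds, so a worker that dies mid-import cannot leave the dataset permanently locked; 600 seconds presumably bounds the longest expected batch. And because the lock is keyed per dataset ID, imports into different datasets still proceed concurrently; only writers to the same dataset are serialized.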