From 3defd2408734e37b0c3eaa22d40b850154e321dc Mon Sep 17 00:00:00 2001
From: kurokobo
Date: Tue, 21 Jan 2025 10:25:40 +0900
Subject: [PATCH] feat: allow updating chunk settings for the existing
 documents (#12833)

---
 api/services/dataset_service.py               | 32 +++++++++----------
 .../knowledge_entities/knowledge_entities.py  |  2 +-
 .../datasets/create/step-two/index.tsx        |  2 +-
 .../documents/detail/completed/index.tsx      |  6 ++++
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py
index dac0a6a772..c55555451b 100644
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -859,7 +859,7 @@ class DocumentService:
         position = DocumentService.get_documents_position(dataset.id)
         document_ids = []
         duplicate_document_ids = []
-        if knowledge_config.data_source.info_list.data_source_type == "upload_file":
+        if knowledge_config.data_source.info_list.data_source_type == "upload_file":  # type: ignore
             upload_file_list = knowledge_config.data_source.info_list.file_info_list.file_ids  # type: ignore
             for file_id in upload_file_list:
                 file = (
@@ -901,7 +901,7 @@ class DocumentService:
                     document = DocumentService.build_document(
                         dataset,
                         dataset_process_rule.id,  # type: ignore
-                        knowledge_config.data_source.info_list.data_source_type,
+                        knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                         knowledge_config.doc_form,
                         knowledge_config.doc_language,
                         data_source_info,
@@ -916,8 +916,8 @@ class DocumentService:
                     document_ids.append(document.id)
                     documents.append(document)
                     position += 1
-        elif knowledge_config.data_source.info_list.data_source_type == "notion_import":
-            notion_info_list = knowledge_config.data_source.info_list.notion_info_list
+        elif knowledge_config.data_source.info_list.data_source_type == "notion_import":  # type: ignore
+            notion_info_list = knowledge_config.data_source.info_list.notion_info_list  # type: ignore
             if not notion_info_list:
                 raise ValueError("No notion info list found.")
             exist_page_ids = []
@@ -956,7 +956,7 @@ class DocumentService:
                     document = DocumentService.build_document(
                         dataset,
                         dataset_process_rule.id,  # type: ignore
-                        knowledge_config.data_source.info_list.data_source_type,
+                        knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                         knowledge_config.doc_form,
                         knowledge_config.doc_language,
                         data_source_info,
@@ -976,8 +976,8 @@ class DocumentService:
             # delete not selected documents
             if len(exist_document) > 0:
                 clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
-        elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":
-            website_info = knowledge_config.data_source.info_list.website_info_list
+        elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":  # type: ignore
+            website_info = knowledge_config.data_source.info_list.website_info_list  # type: ignore
             if not website_info:
                 raise ValueError("No website info list found.")
             urls = website_info.urls
@@ -996,7 +996,7 @@ class DocumentService:
                     document = DocumentService.build_document(
                         dataset,
                         dataset_process_rule.id,  # type: ignore
-                        knowledge_config.data_source.info_list.data_source_type,
+                        knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                         knowledge_config.doc_form,
                         knowledge_config.doc_language,
                         data_source_info,
@@ -1195,20 +1195,20 @@ class DocumentService:
         if features.billing.enabled:
             count = 0
-            if knowledge_config.data_source.info_list.data_source_type == "upload_file":
+            if knowledge_config.data_source.info_list.data_source_type == "upload_file":  # type: ignore
                 upload_file_list = (
-                    knowledge_config.data_source.info_list.file_info_list.file_ids
-                    if knowledge_config.data_source.info_list.file_info_list
+                    knowledge_config.data_source.info_list.file_info_list.file_ids  # type: ignore
+                    if knowledge_config.data_source.info_list.file_info_list  # type: ignore
                     else []
                 )
                 count = len(upload_file_list)
-            elif knowledge_config.data_source.info_list.data_source_type == "notion_import":
-                notion_info_list = knowledge_config.data_source.info_list.notion_info_list
+            elif knowledge_config.data_source.info_list.data_source_type == "notion_import":  # type: ignore
+                notion_info_list = knowledge_config.data_source.info_list.notion_info_list  # type: ignore
                 if notion_info_list:
                     for notion_info in notion_info_list:
                         count = count + len(notion_info.pages)
-            elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":
-                website_info = knowledge_config.data_source.info_list.website_info_list
+            elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":  # type: ignore
+                website_info = knowledge_config.data_source.info_list.website_info_list  # type: ignore
                 if website_info:
                     count = len(website_info.urls)
             batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)
@@ -1239,7 +1239,7 @@ class DocumentService:
             dataset = Dataset(
                 tenant_id=tenant_id,
                 name="",
-                data_source_type=knowledge_config.data_source.info_list.data_source_type,
+                data_source_type=knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                 indexing_technique=knowledge_config.indexing_technique,
                 created_by=account.id,
                 embedding_model=knowledge_config.embedding_model,
diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py
index 76d9c28812..8d6a246b64 100644
--- a/api/services/entities/knowledge_entities/knowledge_entities.py
+++ b/api/services/entities/knowledge_entities/knowledge_entities.py
@@ -97,7 +97,7 @@ class KnowledgeConfig(BaseModel):
     original_document_id: Optional[str] = None
     duplicate: bool = True
     indexing_technique: Literal["high_quality", "economy"]
-    data_source: DataSource
+    data_source: Optional[DataSource] = None
     process_rule: Optional[ProcessRule] = None
     retrieval_model: Optional[RetrievalModel] = None
     doc_form: str = "text_model"
diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx
index 11984d71c6..d6fa45b4fb 100644
--- a/web/app/components/datasets/create/step-two/index.tsx
+++ b/web/app/components/datasets/create/step-two/index.tsx
@@ -1001,7 +1001,7 @@ const StepTwo = ({
                 )
                 : (
-                  {!datasetId && }
+
                 )}
diff --git a/web/app/components/datasets/documents/detail/completed/index.tsx b/web/app/components/datasets/documents/detail/completed/index.tsx
index 69cba1d8cd..d4a4f03578 100644
--- a/web/app/components/datasets/documents/detail/completed/index.tsx
+++ b/web/app/components/datasets/documents/detail/completed/index.tsx
@@ -4,6 +4,7 @@ import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'
 import { useDebounceFn } from 'ahooks'
 import { useTranslation } from 'react-i18next'
 import { createContext, useContext, useContextSelector } from 'use-context-selector'
+import { usePathname } from 'next/navigation'
 import { useDocumentContext } from '../index'
 import { ProcessStatus } from '../segment-add'
 import s from './style.module.css'
@@ -99,6 +100,7 @@ const Completed: FC = ({
 }) => {
   const { t } = useTranslation()
   const { notify } = useContext(ToastContext)
+  const pathname = usePathname()
   const datasetId = useDocumentContext(s => s.datasetId) || ''
   const documentId = useDocumentContext(s => s.documentId) || ''
   const docForm = useDocumentContext(s => s.docForm)
@@ -374,6 +376,10 @@ const Completed: FC = ({
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [segments, datasetId, documentId])
 
+  useEffect(() => {
+    resetList()
+  }, [pathname])
+
   useEffect(() => {
     if (importStatus === ProcessStatus.COMPLETED)
       resetList()
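
Note (reviewer sketch, not part of the patch): the core enabler is the
`data_source: Optional[DataSource] = None` change in KnowledgeConfig. A
request that only updates chunk settings for an existing document carries
`original_document_id` and new process rules but no data source, so the old
required `data_source: DataSource` field made pydantic reject the payload
before DocumentService could reuse the stored source. A minimal sketch of the
validation behavior this unlocks; the field values are illustrative and only
the field names visible in the hunk above are assumed:

    # Hypothetical payload that re-chunks an existing document; values are
    # illustrative, not taken from the patch.
    from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig

    config = KnowledgeConfig(
        original_document_id="existing-document-id",
        indexing_technique="high_quality",
        # data_source omitted: with the previous required annotation this
        # constructor raised a pydantic ValidationError; with
        # `Optional[DataSource] = None` it validates, which is why the
        # service code above guards each `.data_source` access behind the
        # data-source-type branches marked `# type: ignore`.
    )

On the frontend, the `resetList()` effect keyed on `pathname` appears intended
to refresh the segment list after such an update navigates back to the
document detail page.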