diff --git a/api/controllers/console/datasets/data_source.py b/api/controllers/console/datasets/data_source.py
index c0c345baea..f3e639c6ac 100644
--- a/api/controllers/console/datasets/data_source.py
+++ b/api/controllers/console/datasets/data_source.py
@@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource):
             notion_workspace_id=workspace_id,
             notion_obj_id=page_id,
             notion_page_type=page_type,
-            notion_access_token=data_source_binding.access_token
+            notion_access_token=data_source_binding.access_token,
+            tenant_id=current_user.current_tenant_id
         )
 
         text_docs = extractor.extract()
@@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource):
                 notion_info={
                     "notion_workspace_id": workspace_id,
                     "notion_obj_id": page['page_id'],
-                    "notion_page_type": page['type']
+                    "notion_page_type": page['type'],
+                    "tenant_id": current_user.current_tenant_id
                 },
                 document_model=args['doc_form']
             )
diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py
index f80b4de48d..e633631c42 100644
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
@@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource):
                 notion_info={
                     "notion_workspace_id": workspace_id,
                     "notion_obj_id": page['page_id'],
-                    "notion_page_type": page['type']
+                    "notion_page_type": page['type'],
+                    "tenant_id": current_user.current_tenant_id
                 },
                 document_model=args['doc_form']
             )
diff --git a/api/controllers/console/datasets/datasets_document.py b/api/controllers/console/datasets/datasets_document.py
index a990ef96ee..c383cdc762 100644
--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
@@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
                 notion_info={
                     "notion_workspace_id": data_source_info['notion_workspace_id'],
                     "notion_obj_id": data_source_info['notion_page_id'],
-                    "notion_page_type": data_source_info['type']
+                    "notion_page_type": data_source_info['type'],
+                    "tenant_id": current_user.current_tenant_id
                 },
                 document_model=document.doc_form
             )
diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index 68bb294a18..f5ea49bb5e 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -366,7 +366,8 @@ class IndexingRunner:
                     "notion_workspace_id": data_source_info['notion_workspace_id'],
                     "notion_obj_id": data_source_info['notion_page_id'],
                     "notion_page_type": data_source_info['type'],
-                    "document": dataset_document
+                    "document": dataset_document,
+                    "tenant_id": dataset_document.tenant_id
                 },
                 document_model=dataset_document.doc_form
             )
diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py
index c0205d1aa9..e295e58950 100644
--- a/api/core/rag/datasource/retrieval_service.py
+++ b/api/core/rag/datasource/retrieval_service.py
@@ -39,7 +39,8 @@ class RetrievalService:
                 'flask_app': current_app._get_current_object(),
                 'dataset_id': dataset_id,
                 'query': query,
-                'top_k': top_k
+                'top_k': top_k,
+                'all_documents': all_documents
             })
             threads.append(keyword_thread)
             keyword_thread.start()
diff --git a/api/core/rag/extractor/entity/extract_setting.py b/api/core/rag/extractor/entity/extract_setting.py
index bc5310f7be..49cd4d0c03 100644
--- a/api/core/rag/extractor/entity/extract_setting.py
+++ b/api/core/rag/extractor/entity/extract_setting.py
@@ -12,6 +12,7 @@ class NotionInfo(BaseModel):
     notion_obj_id: str
     notion_page_type: str
     document: Document = None
+    tenant_id: str
 
     class Config:
         arbitrary_types_allowed = True
diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py
index 7c7dc5bdae..0de7065335 100644
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -132,7 +132,8 @@ class ExtractProcessor:
                 notion_workspace_id=extract_setting.notion_info.notion_workspace_id,
                 notion_obj_id=extract_setting.notion_info.notion_obj_id,
                 notion_page_type=extract_setting.notion_info.notion_page_type,
-                document_model=extract_setting.notion_info.document
+                document_model=extract_setting.notion_info.document,
+                tenant_id=extract_setting.notion_info.tenant_id,
             )
             return extractor.extract()
         else:
diff --git a/api/core/rag/extractor/html_extractor.py b/api/core/rag/extractor/html_extractor.py
index 557ea42b19..ceb5306255 100644
--- a/api/core/rag/extractor/html_extractor.py
+++ b/api/core/rag/extractor/html_extractor.py
@@ -1,13 +1,14 @@
 """Abstract interface for document loader implementations."""
-from typing import Optional
+from bs4 import BeautifulSoup
 
 from core.rag.extractor.extractor_base import BaseExtractor
-from core.rag.extractor.helpers import detect_file_encodings
 from core.rag.models.document import Document
 
 
 class HtmlExtractor(BaseExtractor):
-    """Load html files.
+
+    """
+    Load html files.
 
 
     Args:
@@ -15,57 +16,19 @@ class HtmlExtractor(BaseExtractor):
     """
 
     def __init__(
-            self,
-            file_path: str,
-            encoding: Optional[str] = None,
-            autodetect_encoding: bool = False,
-            source_column: Optional[str] = None,
-            csv_args: Optional[dict] = None,
+        self,
+        file_path: str
     ):
         """Initialize with file path."""
         self._file_path = file_path
-        self._encoding = encoding
-        self._autodetect_encoding = autodetect_encoding
-        self.source_column = source_column
-        self.csv_args = csv_args or {}
 
     def extract(self) -> list[Document]:
-        """Load data into document objects."""
-        try:
-            with open(self._file_path, newline="", encoding=self._encoding) as csvfile:
-                docs = self._read_from_file(csvfile)
-        except UnicodeDecodeError as e:
-            if self._autodetect_encoding:
-                detected_encodings = detect_file_encodings(self._file_path)
-                for encoding in detected_encodings:
-                    try:
-                        with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile:
-                            docs = self._read_from_file(csvfile)
-                        break
-                    except UnicodeDecodeError:
-                        continue
-            else:
-                raise RuntimeError(f"Error loading {self._file_path}") from e
+        return [Document(page_content=self._load_as_text())]
 
-        return docs
+    def _load_as_text(self) -> str:
+        with open(self._file_path, "rb") as fp:
+            soup = BeautifulSoup(fp, 'html.parser')
+            text = soup.get_text()
+            text = text.strip() if text else ''
 
-    def _read_from_file(self, csvfile) -> list[Document]:
-        docs = []
-        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
-        for i, row in enumerate(csv_reader):
-            content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())
-            try:
-                source = (
-                    row[self.source_column]
-                    if self.source_column is not None
-                    else ''
-                )
-            except KeyError:
-                raise ValueError(
-                    f"Source column '{self.source_column}' not found in CSV file."
-                )
-            metadata = {"source": source, "row": i}
-            doc = Document(page_content=content, metadata=metadata)
-            docs.append(doc)
-
-        return docs
+        return text
\ No newline at end of file
diff --git a/api/core/rag/extractor/notion_extractor.py b/api/core/rag/extractor/notion_extractor.py
index f28436ffd9..38dd36361a 100644
--- a/api/core/rag/extractor/notion_extractor.py
+++ b/api/core/rag/extractor/notion_extractor.py
@@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor):
             notion_workspace_id: str,
             notion_obj_id: str,
             notion_page_type: str,
+            tenant_id: str,
             document_model: Optional[DocumentModel] = None,
-            notion_access_token: Optional[str] = None
+            notion_access_token: Optional[str] = None,
+
     ):
         self._notion_access_token = None
         self._document_model = document_model
diff --git a/api/tasks/document_indexing_sync_task.py b/api/tasks/document_indexing_sync_task.py
index 84e2029705..a646158dbd 100644
--- a/api/tasks/document_indexing_sync_task.py
+++ b/api/tasks/document_indexing_sync_task.py
@@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
             notion_workspace_id=workspace_id,
             notion_obj_id=page_id,
             notion_page_type=page_type,
-            notion_access_token=data_source_binding.access_token
+            notion_access_token=data_source_binding.access_token,
+            tenant_id=document.tenant_id
         )
         last_edited_time = loader.get_notion_last_edited_time()
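Taken together, these changes alter two extractor contracts: NotionExtractor gains a required tenant_id constructor argument (threaded through from every call site above), and HtmlExtractor now does plain-text extraction with BeautifulSoup instead of the CSV-style parsing it previously contained. A minimal usage sketch of the new behavior, not part of the diff; the file path, IDs, and token below are hypothetical placeholders:

    from core.rag.extractor.html_extractor import HtmlExtractor
    from core.rag.extractor.notion_extractor import NotionExtractor

    # HtmlExtractor: strips tags via BeautifulSoup and returns one plain-text Document.
    html_docs = HtmlExtractor("/tmp/example.html").extract()  # hypothetical path
    print(html_docs[0].page_content)

    # NotionExtractor: tenant_id is now required alongside the existing arguments.
    notion_docs = NotionExtractor(
        notion_workspace_id="ws-123",          # hypothetical
        notion_obj_id="page-456",              # hypothetical
        notion_page_type="page",
        tenant_id="tenant-789",                # the new required argument
        notion_access_token="secret-token",    # hypothetical
    ).extract()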