diff --git a/api/core/data_loader/file_extractor.py b/api/core/data_loader/file_extractor.py
index aab35b3a77..00f0e607c8 100644
--- a/api/core/data_loader/file_extractor.py
+++ b/api/core/data_loader/file_extractor.py
@@ -65,7 +65,8 @@ class FileExtractor:
             elif file_extension == '.pdf':
                 loader = PdfLoader(file_path, upload_file=upload_file)
             elif file_extension in ['.md', '.markdown']:
-                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
+                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url) if is_automatic \
+                    else MarkdownLoader(file_path, autodetect_encoding=True)
             elif file_extension in ['.htm', '.html']:
                 loader = HTMLLoader(file_path)
             elif file_extension == '.docx':
@@ -84,7 +85,8 @@ class FileExtractor:
                 loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
             else:
                 # txt
-                loader = UnstructuredTextLoader(file_path, unstructured_api_url)
+                loader = UnstructuredTextLoader(file_path, unstructured_api_url) if is_automatic \
+                    else TextLoader(file_path, autodetect_encoding=True)
         else:
             if file_extension == '.xlsx':
                 loader = ExcelLoader(file_path)
diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index f4ccb7d765..7f1b0a5147 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -59,7 +59,7 @@ class IndexingRunner:
             first()
 
         # load file
-        text_docs = self._load_data(dataset_document)
+        text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
 
         # get splitter
         splitter = self._get_splitter(processing_rule)
@@ -113,15 +113,14 @@ class IndexingRunner:
         for document_segment in document_segments:
             db.session.delete(document_segment)
         db.session.commit()
-
-        # load file
-        text_docs = self._load_data(dataset_document)
-
         # get the process rule
         processing_rule = db.session.query(DatasetProcessRule). \
             filter(DatasetProcessRule.id == dataset_document.dataset_process_rule_id). \
             first()
 
+        # load file
+        text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
+
         # get splitter
         splitter = self._get_splitter(processing_rule)
 
@@ -238,14 +237,15 @@ class IndexingRunner:
         preview_texts = []
         total_segments = 0
         for file_detail in file_details:
-            # load data from file
-            text_docs = FileExtractor.load(file_detail)
 
             processing_rule = DatasetProcessRule(
                 mode=tmp_processing_rule["mode"],
                 rules=json.dumps(tmp_processing_rule["rules"])
             )
 
+            # load data from file
+            text_docs = FileExtractor.load(file_detail, is_automatic=processing_rule.mode == 'automatic')
+
             # get splitter
             splitter = self._get_splitter(processing_rule)
 
@@ -459,7 +459,7 @@ class IndexingRunner:
                 one_or_none()
 
             if file_detail:
-                text_docs = FileExtractor.load(file_detail, is_automatic=True)
+                text_docs = FileExtractor.load(file_detail, is_automatic=automatic)
             elif dataset_document.data_source_type == 'notion_import':
                 loader = NotionLoader.from_document(dataset_document)
                 text_docs = loader.load()
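
Note: the core change in file_extractor.py is a conditional loader selection — the Unstructured API loaders are now used only when the dataset's process rule is in 'automatic' mode, and other modes fall back to the plain local loaders with encoding autodetection. The indexing_runner.py hunks support this by moving the processing-rule lookup ahead of file loading, so the mode is known when the flag is computed. Below is a minimal self-contained sketch of that selection pattern; the stub classes and the pick_text_loader helper are illustrative stand-ins for the real loaders in api/core/data_loader, not part of the patch.

from dataclasses import dataclass


@dataclass
class UnstructuredTextLoader:
    """Stand-in for the API-backed loader; models only the arguments the patch passes."""
    file_path: str
    api_url: str


@dataclass
class TextLoader:
    """Stand-in for the plain local loader."""
    file_path: str
    autodetect_encoding: bool = False


def pick_text_loader(file_path: str, api_url: str, is_automatic: bool):
    # Same conditional-expression shape as the patched branches: route to the
    # Unstructured API only in 'automatic' mode, otherwise read the file
    # locally and let the loader autodetect its encoding.
    return UnstructuredTextLoader(file_path, api_url) if is_automatic \
        else TextLoader(file_path, autodetect_encoding=True)


# Callers derive the flag from the process rule, mirroring
# FileExtractor.load(file_detail, is_automatic=processing_rule.mode == 'automatic'):
print(pick_text_loader("doc.txt", "http://unstructured:8000", is_automatic=False))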