mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 10:48:59 +08:00
fix custom splitter character (#1915)
Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
parent
a798dcfae9
commit
4a3d15b6de
@ -65,7 +65,8 @@ class FileExtractor:
|
|||||||
elif file_extension == '.pdf':
|
elif file_extension == '.pdf':
|
||||||
loader = PdfLoader(file_path, upload_file=upload_file)
|
loader = PdfLoader(file_path, upload_file=upload_file)
|
||||||
elif file_extension in ['.md', '.markdown']:
|
elif file_extension in ['.md', '.markdown']:
|
||||||
loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
|
loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url) if is_automatic \
|
||||||
|
else MarkdownLoader(file_path, autodetect_encoding=True)
|
||||||
elif file_extension in ['.htm', '.html']:
|
elif file_extension in ['.htm', '.html']:
|
||||||
loader = HTMLLoader(file_path)
|
loader = HTMLLoader(file_path)
|
||||||
elif file_extension == '.docx':
|
elif file_extension == '.docx':
|
||||||
@ -84,7 +85,8 @@ class FileExtractor:
|
|||||||
loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
|
loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
|
||||||
else:
|
else:
|
||||||
# txt
|
# txt
|
||||||
loader = UnstructuredTextLoader(file_path, unstructured_api_url)
|
loader = UnstructuredTextLoader(file_path, unstructured_api_url) if is_automatic \
|
||||||
|
else TextLoader(file_path, autodetect_encoding=True)
|
||||||
else:
|
else:
|
||||||
if file_extension == '.xlsx':
|
if file_extension == '.xlsx':
|
||||||
loader = ExcelLoader(file_path)
|
loader = ExcelLoader(file_path)
|
||||||
|
@ -59,7 +59,7 @@ class IndexingRunner:
|
|||||||
first()
|
first()
|
||||||
|
|
||||||
# load file
|
# load file
|
||||||
text_docs = self._load_data(dataset_document)
|
text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
|
||||||
|
|
||||||
# get splitter
|
# get splitter
|
||||||
splitter = self._get_splitter(processing_rule)
|
splitter = self._get_splitter(processing_rule)
|
||||||
@ -113,15 +113,14 @@ class IndexingRunner:
|
|||||||
for document_segment in document_segments:
|
for document_segment in document_segments:
|
||||||
db.session.delete(document_segment)
|
db.session.delete(document_segment)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
|
||||||
# load file
|
|
||||||
text_docs = self._load_data(dataset_document)
|
|
||||||
|
|
||||||
# get the process rule
|
# get the process rule
|
||||||
processing_rule = db.session.query(DatasetProcessRule). \
|
processing_rule = db.session.query(DatasetProcessRule). \
|
||||||
filter(DatasetProcessRule.id == dataset_document.dataset_process_rule_id). \
|
filter(DatasetProcessRule.id == dataset_document.dataset_process_rule_id). \
|
||||||
first()
|
first()
|
||||||
|
|
||||||
|
# load file
|
||||||
|
text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
|
||||||
|
|
||||||
# get splitter
|
# get splitter
|
||||||
splitter = self._get_splitter(processing_rule)
|
splitter = self._get_splitter(processing_rule)
|
||||||
|
|
||||||
@ -238,14 +237,15 @@ class IndexingRunner:
|
|||||||
preview_texts = []
|
preview_texts = []
|
||||||
total_segments = 0
|
total_segments = 0
|
||||||
for file_detail in file_details:
|
for file_detail in file_details:
|
||||||
# load data from file
|
|
||||||
text_docs = FileExtractor.load(file_detail)
|
|
||||||
|
|
||||||
processing_rule = DatasetProcessRule(
|
processing_rule = DatasetProcessRule(
|
||||||
mode=tmp_processing_rule["mode"],
|
mode=tmp_processing_rule["mode"],
|
||||||
rules=json.dumps(tmp_processing_rule["rules"])
|
rules=json.dumps(tmp_processing_rule["rules"])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# load data from file
|
||||||
|
text_docs = FileExtractor.load(file_detail, is_automatic=processing_rule.mode == 'automatic')
|
||||||
|
|
||||||
# get splitter
|
# get splitter
|
||||||
splitter = self._get_splitter(processing_rule)
|
splitter = self._get_splitter(processing_rule)
|
||||||
|
|
||||||
@ -459,7 +459,7 @@ class IndexingRunner:
|
|||||||
one_or_none()
|
one_or_none()
|
||||||
|
|
||||||
if file_detail:
|
if file_detail:
|
||||||
text_docs = FileExtractor.load(file_detail, is_automatic=True)
|
text_docs = FileExtractor.load(file_detail, is_automatic=automatic)
|
||||||
elif dataset_document.data_source_type == 'notion_import':
|
elif dataset_document.data_source_type == 'notion_import':
|
||||||
loader = NotionLoader.from_document(dataset_document)
|
loader = NotionLoader.from_document(dataset_document)
|
||||||
text_docs = loader.load()
|
text_docs = loader.load()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user