Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-08-12 03:29:01 +08:00)
add multi-thread document embedding (#3016)
Co-authored-by: jyong <jyong@dify.ai>
Parent: 20d16d7b31
Commit: b0b0cc045f
@@ -1,3 +1,4 @@
+import concurrent.futures
 import datetime
 import json
 import logging
@@ -650,17 +651,44 @@ class IndexingRunner:
         # chunk nodes by chunk size
         indexing_start_at = time.perf_counter()
         tokens = 0
-        chunk_size = 100
+        chunk_size = 10

         embedding_model_type_instance = None
         if embedding_model_instance:
             embedding_model_type_instance = embedding_model_instance.model_type_instance
             embedding_model_type_instance = cast(TextEmbeddingModel, embedding_model_type_instance)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            futures = []
+            for i in range(0, len(documents), chunk_size):
+                chunk_documents = documents[i:i + chunk_size]
+                futures.append(executor.submit(self._process_chunk, current_app._get_current_object(), index_processor,
+                                               chunk_documents, dataset,
+                                               dataset_document, embedding_model_instance,
+                                               embedding_model_type_instance))
-        for i in range(0, len(documents), chunk_size):
+
+            for future in futures:
+                tokens += future.result()
+
+        indexing_end_at = time.perf_counter()
+
+        # update document status to completed
+        self._update_document_index_status(
+            document_id=dataset_document.id,
+            after_indexing_status="completed",
+            extra_update_params={
+                DatasetDocument.tokens: tokens,
+                DatasetDocument.completed_at: datetime.datetime.utcnow(),
+                DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
+            }
+        )
+
+    def _process_chunk(self, flask_app, index_processor, chunk_documents, dataset, dataset_document,
+                       embedding_model_instance, embedding_model_type_instance):
+        with flask_app.app_context():
             # check document is paused
             self._check_document_paused_status(dataset_document.id)
-            chunk_documents = documents[i:i + chunk_size]
+
+            tokens = 0
             if dataset.indexing_technique == 'high_quality' or embedding_model_type_instance:
                 tokens += sum(
                     embedding_model_type_instance.get_num_tokens(
@@ -670,9 +698,9 @@ class IndexingRunner:
                     )
                     for document in chunk_documents
                 )

             # load index
             index_processor.load(dataset, chunk_documents)
-            db.session.add(dataset)
+
             document_ids = [document.metadata['doc_id'] for document in chunk_documents]
             db.session.query(DocumentSegment).filter(
@@ -687,18 +715,7 @@ class IndexingRunner:

             db.session.commit()

-        indexing_end_at = time.perf_counter()
-
-        # update document status to completed
-        self._update_document_index_status(
-            document_id=dataset_document.id,
-            after_indexing_status="completed",
-            extra_update_params={
-                DatasetDocument.tokens: tokens,
-                DatasetDocument.completed_at: datetime.datetime.utcnow(),
-                DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
-            }
-        )
+            return tokens

     def _check_document_paused_status(self, document_id: str):
         indexing_cache_key = 'document_{}_is_paused'.format(document_id)
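Taken together, the IndexingRunner hunks split the documents into chunks of 10, submit each chunk to a ThreadPoolExecutor with at most 10 workers, and sum the per-chunk token counts returned by the futures, while each worker re-enters the Flask application context before touching the database. Below is a minimal, self-contained sketch of that fan-out/fan-in pattern, assuming Flask is installed and Python 3.9+; make_app, embed_chunk, and embed_all are illustrative names rather than Dify's API, and the word-count stand-in replaces the real get_num_tokens / index_processor.load work.

import concurrent.futures

from flask import Flask, current_app


def make_app() -> Flask:
    # Illustrative app factory; the real application is built elsewhere.
    return Flask(__name__)


def embed_chunk(flask_app: Flask, chunk: list[str]) -> int:
    # Worker threads do not inherit the caller's app context, so the caller
    # passes the concrete app object and each worker pushes its own context.
    with flask_app.app_context():
        # Stand-in for per-chunk token counting, embedding, and DB writes.
        return sum(len(text.split()) for text in chunk)


def embed_all(documents: list[str], chunk_size: int = 10, max_workers: int = 10) -> int:
    # current_app is a proxy; _get_current_object() unwraps it so the real app
    # object can be handed to worker threads (as the diff does in executor.submit).
    app = current_app._get_current_object()
    tokens = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(embed_chunk, app, documents[i:i + chunk_size])
            for i in range(0, len(documents), chunk_size)
        ]
        for future in futures:
            tokens += future.result()  # re-raises any exception from the worker
    return tokens


if __name__ == "__main__":
    app = make_app()
    with app.app_context():
        print(embed_all(["alpha beta", "gamma", "delta epsilon zeta"] * 7))

A thread pool fits here because the per-chunk work is dominated by network-bound embedding calls and database I/O; collecting results through future.result() also surfaces worker exceptions in the parent thread instead of losing them silently.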
@@ -53,7 +53,7 @@ class UnstructuredWordExtractor(BaseExtractor):
             elements = partition_docx(filename=self._file_path)

         from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
         for chunk in chunks:
             text = chunk.text.strip()
@@ -43,7 +43,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
             pass

         from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
         for chunk in chunks:
             text = chunk.text.strip()
@@ -38,7 +38,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):

         elements = partition_md(filename=self._file_path, api_url=self._api_url)
         from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
         for chunk in chunks:
             text = chunk.text.strip()
@@ -28,7 +28,7 @@ class UnstructuredMsgExtractor(BaseExtractor):

         elements = partition_msg(filename=self._file_path, api_url=self._api_url)
         from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
         for chunk in chunks:
             text = chunk.text.strip()
@@ -28,7 +28,7 @@ class UnstructuredTextExtractor(BaseExtractor):

         elements = partition_text(filename=self._file_path, api_url=self._api_url)
         from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
         for chunk in chunks:
             text = chunk.text.strip()
@@ -28,7 +28,7 @@ class UnstructuredXmlExtractor(BaseExtractor):

         elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
         from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
         for chunk in chunks:
             text = chunk.text.strip()
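The six extractor hunks are the same one-parameter change: combine_text_under_n_chars in chunk_by_title goes from 0 to 2000, so consecutive small elements under a title are merged into chunks of up to roughly 2000 characters instead of each becoming a tiny standalone chunk. A short sketch of the effect, with made-up elements (the comparison itself is illustrative and not taken from the repository):

# Requires the `unstructured` package, which provides chunk_by_title.
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import NarrativeText, Title

elements = [
    Title("Introduction"),
    NarrativeText("A short opening paragraph."),
    NarrativeText("Another short paragraph."),
    Title("Details"),
    NarrativeText("A brief section body."),
]

# Previous behaviour: no combining, so every small section becomes its own chunk.
separate = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)

# New behaviour: small neighbouring sections are merged until a chunk nears
# 2000 characters, producing fewer, larger chunks to embed.
combined = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)

print(len(separate), len(combined))  # combined should contain no more chunks than separate

Fewer, larger chunks generally mean fewer embedding calls per document, which complements the multi-threaded indexing change above.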