mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 01:08:57 +08:00
Feat/add thread control (#675)
This commit is contained in:
parent
ca60610306
commit
9eaae770a6
@ -1,4 +1,3 @@
|
|||||||
import asyncio
|
|
||||||
import concurrent
|
import concurrent
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
@ -8,25 +7,17 @@ import threading
|
|||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from multiprocessing import Process
|
|
||||||
from typing import Optional, List, cast
|
from typing import Optional, List, cast
|
||||||
|
|
||||||
import openai
|
|
||||||
from billiard.pool import Pool
|
|
||||||
from flask import current_app, Flask
|
|
||||||
from flask_login import current_user
|
from flask_login import current_user
|
||||||
from langchain.embeddings import OpenAIEmbeddings
|
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
||||||
|
|
||||||
from core.data_loader.file_extractor import FileExtractor
|
from core.data_loader.file_extractor import FileExtractor
|
||||||
from core.data_loader.loader.notion import NotionLoader
|
from core.data_loader.loader.notion import NotionLoader
|
||||||
from core.docstore.dataset_docstore import DatesetDocumentStore
|
from core.docstore.dataset_docstore import DatesetDocumentStore
|
||||||
from core.embedding.cached_embedding import CacheEmbedding
|
|
||||||
from core.generator.llm_generator import LLMGenerator
|
from core.generator.llm_generator import LLMGenerator
|
||||||
from core.index.index import IndexBuilder
|
from core.index.index import IndexBuilder
|
||||||
from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
|
|
||||||
from core.index.vector_index.vector_index import VectorIndex
|
|
||||||
from core.llm.error import ProviderTokenNotInitError
|
from core.llm.error import ProviderTokenNotInitError
|
||||||
from core.llm.llm_builder import LLMBuilder
|
from core.llm.llm_builder import LLMBuilder
|
||||||
from core.llm.streamable_open_ai import StreamableOpenAI
|
from core.llm.streamable_open_ai import StreamableOpenAI
|
||||||
@ -516,20 +507,23 @@ class IndexingRunner:
|
|||||||
model_name='gpt-3.5-turbo',
|
model_name='gpt-3.5-turbo',
|
||||||
max_tokens=2000
|
max_tokens=2000
|
||||||
)
|
)
|
||||||
threads = []
|
for i in range(0, len(documents), 10):
|
||||||
for doc in documents:
|
threads = []
|
||||||
document_format_thread = threading.Thread(target=self.format_document, kwargs={
|
sub_documents = documents[i:i + 10]
|
||||||
'llm': llm, 'document_node': doc, 'split_documents': split_documents, 'document_form': document_form})
|
for doc in sub_documents:
|
||||||
threads.append(document_format_thread)
|
document_format_thread = threading.Thread(target=self.format_document, kwargs={
|
||||||
document_format_thread.start()
|
'llm': llm, 'document_node': doc, 'split_documents': split_documents,
|
||||||
for thread in threads:
|
'document_form': document_form})
|
||||||
thread.join()
|
threads.append(document_format_thread)
|
||||||
|
document_format_thread.start()
|
||||||
|
for thread in threads:
|
||||||
|
thread.join()
|
||||||
|
|
||||||
all_documents.extend(split_documents)
|
all_documents.extend(split_documents)
|
||||||
|
|
||||||
return all_documents
|
return all_documents
|
||||||
|
|
||||||
def format_document(self, llm: StreamableOpenAI, document_node, split_documents: List, document_form: str):
|
def format_document(self, llm: StreamableOpenAI, document_node, split_documents, document_form: str):
|
||||||
print(document_node.page_content)
|
|
||||||
format_documents = []
|
format_documents = []
|
||||||
if document_node.page_content is None or not document_node.page_content.strip():
|
if document_node.page_content is None or not document_node.page_content.strip():
|
||||||
return format_documents
|
return format_documents
|
||||||
|
Loading…
x
Reference in New Issue
Block a user