Feat/add thread control (#675)

2025-08-11 01:08:57 +08:00 · 2023-07-29 17:00:21 +08:00 · 2023-07-29 17:00:21 +08:00 · 9eaae770a6
commit 9eaae770a6
parent ca60610306
1 changed file with 13 additions and 19 deletions
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -1,4 +1,3 @@
 import asyncio
 import concurrent
 import datetime
 import json
@@ -8,25 +7,17 @@ import threading
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from multiprocessing import Process
 from typing import Optional, List, cast
 import openai
 from billiard.pool import Pool
 from flask import current_app, Flask
 from flask_login import current_user
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
 from core.data_loader.file_extractor import FileExtractor
 from core.data_loader.loader.notion import NotionLoader
 from core.docstore.dataset_docstore import DatesetDocumentStore
 from core.embedding.cached_embedding import CacheEmbedding
 from core.generator.llm_generator import LLMGenerator
 from core.index.index import IndexBuilder
 from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
 from core.index.vector_index.vector_index import VectorIndex
 from core.llm.error import ProviderTokenNotInitError
 from core.llm.llm_builder import LLMBuilder
 from core.llm.streamable_open_ai import StreamableOpenAI
@@ -516,20 +507,23 @@ class IndexingRunner:
                model_name='gpt-3.5-turbo',
                max_tokens=2000
            )
-            threads = []
+            for i in range(0, len(documents), 10):
-            for doc in documents:
+                threads = []
-                document_format_thread = threading.Thread(target=self.format_document, kwargs={
+                sub_documents = documents[i:i + 10]
-                    'llm': llm, 'document_node': doc, 'split_documents': split_documents, 'document_form': document_form})
+                for doc in sub_documents:
-                threads.append(document_format_thread)
+                    document_format_thread = threading.Thread(target=self.format_document, kwargs={
-                document_format_thread.start()
+                        'llm': llm, 'document_node': doc, 'split_documents': split_documents,
-            for thread in threads:
+                        'document_form': document_form})
-                thread.join()
+                    threads.append(document_format_thread)
                    document_format_thread.start()
                for thread in threads:
                    thread.join()
            all_documents.extend(split_documents)
        return all_documents
-    def format_document(self, llm: StreamableOpenAI, document_node, split_documents: List, document_form: str):
+    def format_document(self, llm: StreamableOpenAI, document_node, split_documents, document_form: str):
        print(document_node.page_content)
        format_documents = []
        if document_node.page_content is None or not document_node.page_content.strip():
            return format_documents