diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 4fbc6289fb..836cf3e671 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -655,7 +655,9 @@ class IndexingRunner: else: page_content = page_content document_node.page_content = page_content - split_documents.append(document_node) + + if document_node.page_content: + split_documents.append(document_node) all_documents.extend(split_documents) # processing qa document if document_form == 'qa_model': diff --git a/api/core/model_runtime/model_providers/azure_openai/text_embedding/text_embedding.py b/api/core/model_runtime/model_providers/azure_openai/text_embedding/text_embedding.py index 227cd64fba..e472151cb5 100644 --- a/api/core/model_runtime/model_providers/azure_openai/text_embedding/text_embedding.py +++ b/api/core/model_runtime/model_providers/azure_openai/text_embedding/text_embedding.py @@ -1,7 +1,7 @@ import base64 import copy import time -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np import tiktoken @@ -76,7 +76,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel): embeddings_batch, embedding_used_tokens = self._embedding_invoke( model=model, client=client, - texts=[""], + texts="", extra_model_kwargs=extra_model_kwargs ) @@ -147,7 +147,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel): return ai_model_entity.entity @staticmethod - def _embedding_invoke(model: str, client: AzureOpenAI, texts: list[str], + def _embedding_invoke(model: str, client: AzureOpenAI, texts: Union[list[str], str], extra_model_kwargs: dict) -> Tuple[list[list[float]], int]: response = client.embeddings.create( input=texts, diff --git a/api/core/model_runtime/model_providers/cohere/text_embedding/text_embedding.py b/api/core/model_runtime/model_providers/cohere/text_embedding/text_embedding.py index d824ed0b3d..a1c73ab6a0 100644 --- a/api/core/model_runtime/model_providers/cohere/text_embedding/text_embedding.py +++ b/api/core/model_runtime/model_providers/cohere/text_embedding/text_embedding.py @@ -76,7 +76,7 @@ class CohereTextEmbeddingModel(TextEmbeddingModel): embeddings_batch, embedding_used_tokens = self._embedding_invoke( model=model, credentials=credentials, - texts=[""] + texts=[" "] ) used_tokens += embedding_used_tokens @@ -131,6 +131,9 @@ class CohereTextEmbeddingModel(TextEmbeddingModel): :param text: text to tokenize :return: """ + if not text: + return Tokens([], [], {}) + # initialize client client = cohere.Client(credentials.get('api_key')) diff --git a/api/core/model_runtime/model_providers/openai/text_embedding/text_embedding.py b/api/core/model_runtime/model_providers/openai/text_embedding/text_embedding.py index cde354e861..87a5cf1a2a 100644 --- a/api/core/model_runtime/model_providers/openai/text_embedding/text_embedding.py +++ b/api/core/model_runtime/model_providers/openai/text_embedding/text_embedding.py @@ -1,6 +1,6 @@ import base64 import time -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np import tiktoken @@ -89,7 +89,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel): embeddings_batch, embedding_used_tokens = self._embedding_invoke( model=model, client=client, - texts=[""], + texts="", extra_model_kwargs=extra_model_kwargs ) @@ -160,7 +160,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel): except Exception as ex: raise CredentialsValidateFailedError(str(ex)) - def _embedding_invoke(self, model: str, client: OpenAI, texts: list[str], + def _embedding_invoke(self, model: str, client: OpenAI, texts: Union[list[str], str], extra_model_kwargs: dict) -> Tuple[list[list[float]], int]: """ Invoke embedding model