fix: split chunks return empty strings (#2197)

This commit is contained in:
takatost 2024-01-25 13:59:18 +08:00 committed by GitHub
parent 8639abec97
commit 6cf93379b3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 13 additions and 8 deletions

View File

@@ -655,7 +655,9 @@ class IndexingRunner:
else: else:
page_content = page_content page_content = page_content
document_node.page_content = page_content document_node.page_content = page_content
split_documents.append(document_node)
if document_node.page_content:
split_documents.append(document_node)
all_documents.extend(split_documents) all_documents.extend(split_documents)
# processing qa document # processing qa document
if document_form == 'qa_model': if document_form == 'qa_model':

View File

@@ -1,7 +1,7 @@
import base64 import base64
import copy import copy
import time import time
from typing import Optional, Tuple from typing import Optional, Tuple, Union
import numpy as np import numpy as np
import tiktoken import tiktoken
@@ -76,7 +76,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
embeddings_batch, embedding_used_tokens = self._embedding_invoke( embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model, model=model,
client=client, client=client,
texts=[""], texts="",
extra_model_kwargs=extra_model_kwargs extra_model_kwargs=extra_model_kwargs
) )
@@ -147,7 +147,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
return ai_model_entity.entity return ai_model_entity.entity
@staticmethod @staticmethod
def _embedding_invoke(model: str, client: AzureOpenAI, texts: list[str], def _embedding_invoke(model: str, client: AzureOpenAI, texts: Union[list[str], str],
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]: extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
response = client.embeddings.create( response = client.embeddings.create(
input=texts, input=texts,

View File

@@ -76,7 +76,7 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
embeddings_batch, embedding_used_tokens = self._embedding_invoke( embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model, model=model,
credentials=credentials, credentials=credentials,
texts=[""] texts=[" "]
) )
used_tokens += embedding_used_tokens used_tokens += embedding_used_tokens
@@ -131,6 +131,9 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
:param text: text to tokenize :param text: text to tokenize
:return: :return:
""" """
if not text:
return Tokens([], [], {})
# initialize client # initialize client
client = cohere.Client(credentials.get('api_key')) client = cohere.Client(credentials.get('api_key'))

View File

@@ -1,6 +1,6 @@
import base64 import base64
import time import time
from typing import Optional, Tuple from typing import Optional, Tuple, Union
import numpy as np import numpy as np
import tiktoken import tiktoken
@@ -89,7 +89,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
embeddings_batch, embedding_used_tokens = self._embedding_invoke( embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model, model=model,
client=client, client=client,
texts=[""], texts="",
extra_model_kwargs=extra_model_kwargs extra_model_kwargs=extra_model_kwargs
) )
@@ -160,7 +160,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
except Exception as ex: except Exception as ex:
raise CredentialsValidateFailedError(str(ex)) raise CredentialsValidateFailedError(str(ex))
def _embedding_invoke(self, model: str, client: OpenAI, texts: list[str], def _embedding_invoke(self, model: str, client: OpenAI, texts: Union[list[str], str],
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]: extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
""" """
Invoke embedding model Invoke embedding model