mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 03:49:04 +08:00
fix: split chunks return empty strings (#2197)
This commit is contained in:
parent
8639abec97
commit
6cf93379b3
@@ -655,7 +655,9 @@ class IndexingRunner:
|
||||
else:
|
||||
page_content = page_content
|
||||
document_node.page_content = page_content
|
||||
- split_documents.append(document_node)
|
||||
|
||||
+ if document_node.page_content:
|
||||
+ split_documents.append(document_node)
|
||||
all_documents.extend(split_documents)
|
||||
# processing qa document
|
||||
if document_form == 'qa_model':
|
||||
|
@@ -1,7 +1,7 @@
|
||||
import base64
|
||||
import copy
|
||||
import time
|
||||
- from typing import Optional, Tuple
|
||||
+ from typing import Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import tiktoken
|
||||
@@ -76,7 +76,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
|
||||
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
||||
model=model,
|
||||
client=client,
|
||||
- texts=[""],
|
||||
+ texts="",
|
||||
extra_model_kwargs=extra_model_kwargs
|
||||
)
|
||||
|
||||
@@ -147,7 +147,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
|
||||
return ai_model_entity.entity
|
||||
|
||||
@staticmethod
|
||||
- def _embedding_invoke(model: str, client: AzureOpenAI, texts: list[str],
|
||||
+ def _embedding_invoke(model: str, client: AzureOpenAI, texts: Union[list[str], str],
|
||||
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
|
||||
response = client.embeddings.create(
|
||||
input=texts,
|
||||
|
@@ -76,7 +76,7 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
|
||||
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
||||
model=model,
|
||||
credentials=credentials,
|
||||
- texts=[""]
|
||||
+ texts=[" "]
|
||||
)
|
||||
|
||||
used_tokens += embedding_used_tokens
|
||||
@@ -131,6 +131,9 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
|
||||
:param text: text to tokenize
|
||||
:return:
|
||||
"""
|
||||
+ if not text:
|
||||
+ return Tokens([], [], {})
|
||||
|
||||
# initialize client
|
||||
client = cohere.Client(credentials.get('api_key'))
|
||||
|
||||
|
@@ -1,6 +1,6 @@
|
||||
import base64
|
||||
import time
|
||||
- from typing import Optional, Tuple
|
||||
+ from typing import Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import tiktoken
|
||||
@@ -89,7 +89,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
|
||||
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
||||
model=model,
|
||||
client=client,
|
||||
- texts=[""],
|
||||
+ texts="",
|
||||
extra_model_kwargs=extra_model_kwargs
|
||||
)
|
||||
|
||||
@@ -160,7 +160,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
|
||||
except Exception as ex:
|
||||
raise CredentialsValidateFailedError(str(ex))
|
||||
|
||||
- def _embedding_invoke(self, model: str, client: OpenAI, texts: list[str],
|
||||
+ def _embedding_invoke(self, model: str, client: OpenAI, texts: Union[list[str], str],
|
||||
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
|
||||
"""
|
||||
Invoke embedding model
|
||||
|
Loading…
x
Reference in New Issue
Block a user