Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-08-12 20:39:01 +08:00)
fix: split chunks return empty strings (#2197)
This commit is contained in:
parent 8639abec97
commit 6cf93379b3
@ -655,7 +655,9 @@ class IndexingRunner:
|
|||||||
else:
|
else:
|
||||||
page_content = page_content
|
page_content = page_content
|
||||||
document_node.page_content = page_content
|
document_node.page_content = page_content
|
||||||
split_documents.append(document_node)
|
|
||||||
|
if document_node.page_content:
|
||||||
|
split_documents.append(document_node)
|
||||||
all_documents.extend(split_documents)
|
all_documents.extend(split_documents)
|
||||||
# processing qa document
|
# processing qa document
|
||||||
if document_form == 'qa_model':
|
if document_form == 'qa_model':
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import base64
|
import base64
|
||||||
import copy
|
import copy
|
||||||
import time
|
import time
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tiktoken
|
import tiktoken
|
||||||
@ -76,7 +76,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
|
|||||||
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
||||||
model=model,
|
model=model,
|
||||||
client=client,
|
client=client,
|
||||||
texts=[""],
|
texts="",
|
||||||
extra_model_kwargs=extra_model_kwargs
|
extra_model_kwargs=extra_model_kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -147,7 +147,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
|
|||||||
return ai_model_entity.entity
|
return ai_model_entity.entity
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _embedding_invoke(model: str, client: AzureOpenAI, texts: list[str],
|
def _embedding_invoke(model: str, client: AzureOpenAI, texts: Union[list[str], str],
|
||||||
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
|
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
|
||||||
response = client.embeddings.create(
|
response = client.embeddings.create(
|
||||||
input=texts,
|
input=texts,
|
||||||
|
@ -76,7 +76,7 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
|
|||||||
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
||||||
model=model,
|
model=model,
|
||||||
credentials=credentials,
|
credentials=credentials,
|
||||||
texts=[""]
|
texts=[" "]
|
||||||
)
|
)
|
||||||
|
|
||||||
used_tokens += embedding_used_tokens
|
used_tokens += embedding_used_tokens
|
||||||
@ -131,6 +131,9 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
|
|||||||
:param text: text to tokenize
|
:param text: text to tokenize
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
if not text:
|
||||||
|
return Tokens([], [], {})
|
||||||
|
|
||||||
# initialize client
|
# initialize client
|
||||||
client = cohere.Client(credentials.get('api_key'))
|
client = cohere.Client(credentials.get('api_key'))
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import base64
|
import base64
|
||||||
import time
|
import time
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tiktoken
|
import tiktoken
|
||||||
@ -89,7 +89,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
|
|||||||
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
|
||||||
model=model,
|
model=model,
|
||||||
client=client,
|
client=client,
|
||||||
texts=[""],
|
texts="",
|
||||||
extra_model_kwargs=extra_model_kwargs
|
extra_model_kwargs=extra_model_kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -160,7 +160,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
|
|||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
raise CredentialsValidateFailedError(str(ex))
|
raise CredentialsValidateFailedError(str(ex))
|
||||||
|
|
||||||
def _embedding_invoke(self, model: str, client: OpenAI, texts: list[str],
|
def _embedding_invoke(self, model: str, client: OpenAI, texts: Union[list[str], str],
|
||||||
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
|
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
|
||||||
"""
|
"""
|
||||||
Invoke embedding model
|
Invoke embedding model
|
||||||
|
Loading…
x
Reference in New Issue
Block a user