diff --git a/api/core/memory/token_buffer_memory.py b/api/core/memory/token_buffer_memory.py
index 282cd9b36f..81d08dc885 100644
--- a/api/core/memory/token_buffer_memory.py
+++ b/api/core/memory/token_buffer_memory.py
@@ -3,7 +3,6 @@ from typing import Optional
from core.app.app_config.features.file_upload.manager import FileUploadConfigManager
from core.file import file_manager
-from core.file.models import FileType
from core.model_manager import ModelInstance
from core.model_runtime.entities import (
AssistantPromptMessage,
@@ -103,12 +102,11 @@ class TokenBufferMemory:
prompt_message_contents: list[PromptMessageContent] = []
prompt_message_contents.append(TextPromptMessageContent(data=message.query))
for file in file_objs:
- if file.type in {FileType.IMAGE, FileType.AUDIO}:
- prompt_message = file_manager.to_prompt_message_content(
- file,
- image_detail_config=detail,
- )
- prompt_message_contents.append(prompt_message)
+ prompt_message = file_manager.to_prompt_message_content(
+ file,
+ image_detail_config=detail,
+ )
+ prompt_message_contents.append(prompt_message)
prompt_messages.append(UserPromptMessage(content=prompt_message_contents))
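With the `FileType.IMAGE`/`FileType.AUDIO` filter removed, `TokenBufferMemory` forwards every file to `file_manager.to_prompt_message_content`, so that helper must now cover all file types itself. A minimal, self-contained sketch of the contract this change assumes (the real converter returns prompt-message content objects, not strings):

```python
from enum import Enum


class FileType(str, Enum):
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    DOCUMENT = "document"


def to_prompt_message_content(file_type: FileType) -> str:
    # Stand-in for file_manager.to_prompt_message_content: with the caller no
    # longer pre-filtering, every FileType must either convert or raise here.
    handlers = {
        FileType.IMAGE: "ImagePromptMessageContent",
        FileType.AUDIO: "AudioPromptMessageContent",
        FileType.VIDEO: "VideoPromptMessageContent",
        FileType.DOCUMENT: "DocumentPromptMessageContent",
    }
    if file_type not in handlers:
        raise ValueError(f"file type {file_type} not supported")
    return handlers[file_type]
```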
diff --git a/api/core/model_runtime/entities/message_entities.py b/api/core/model_runtime/entities/message_entities.py
index a7e3db0032..e86fb37522 100644
--- a/api/core/model_runtime/entities/message_entities.py
+++ b/api/core/model_runtime/entities/message_entities.py
@@ -49,7 +49,7 @@ class PromptMessageFunction(BaseModel):
function: PromptMessageTool
-class PromptMessageContentType(Enum):
+class PromptMessageContentType(str, Enum):
"""
Enum class for prompt message content type.
"""
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-001.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-001.yaml
index 2e68fa8e6f..43f4e4787d 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-001.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-001.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 1048576
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-002.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-002.yaml
index 9f44504e89..7b9add6af1 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-002.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-002.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 1048576
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0827.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0827.yaml
index a3da9095e1..d6de82012e 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0827.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0827.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 1048576
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0924.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0924.yaml
index 19373e4993..23b8d318fc 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0924.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0924.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 1048576
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-exp-0827.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-exp-0827.yaml
index ca1f0b39b2..9762706cd7 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-exp-0827.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-exp-0827.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 1048576
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-latest.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-latest.yaml
index 24e8c3a74f..b9739d068e 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-latest.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-latest.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 1048576
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash.yaml
index fa3e814fc3..d8ab4efc91 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 1048576
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-001.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-001.yaml
index da125e6fab..05184823e4 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-001.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-001.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 2097152
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-002.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-002.yaml
index f683e54d3b..548fe6ddb2 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-002.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-002.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 2097152
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0801.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0801.yaml
index c67c156bdb..defab26acf 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0801.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0801.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 2097152
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0827.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0827.yaml
index 56059fd799..9cbc889f17 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0827.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0827.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 2097152
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-latest.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-latest.yaml
index ec376f3186..e5aefcdb99 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-latest.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-latest.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 2097152
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro.yaml
index 8394cdfb56..00bd3e8d99 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 2097152
diff --git a/api/core/model_runtime/model_providers/google/llm/gemini-exp-1114.yaml b/api/core/model_runtime/model_providers/google/llm/gemini-exp-1114.yaml
index 2d4965ad25..0515e706c2 100644
--- a/api/core/model_runtime/model_providers/google/llm/gemini-exp-1114.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-exp-1114.yaml
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
+ - document
model_properties:
mode: chat
context_size: 32767
diff --git a/api/core/model_runtime/model_providers/google/llm/llm.py b/api/core/model_runtime/model_providers/google/llm/llm.py
index 754f056ac1..77e0801b63 100644
--- a/api/core/model_runtime/model_providers/google/llm/llm.py
+++ b/api/core/model_runtime/model_providers/google/llm/llm.py
@@ -16,6 +16,7 @@ from PIL import Image
from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta
from core.model_runtime.entities.message_entities import (
AssistantPromptMessage,
+ DocumentPromptMessageContent,
ImagePromptMessageContent,
PromptMessage,
PromptMessageContentType,
@@ -35,6 +36,21 @@ from core.model_runtime.errors.invoke import (
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel
+GOOGLE_AVAILABLE_MIMETYPE = [
+ "application/pdf",
+ "application/x-javascript",
+ "text/javascript",
+ "application/x-python",
+ "text/x-python",
+ "text/plain",
+ "text/html",
+ "text/css",
+ "text/md",
+ "text/csv",
+ "text/xml",
+ "text/rtf",
+]
+
class GoogleLargeLanguageModel(LargeLanguageModel):
def _invoke(
@@ -370,6 +386,12 @@ class GoogleLargeLanguageModel(LargeLanguageModel):
raise ValueError(f"Failed to fetch image data from url {message_content.data}, {ex}")
blob = {"inline_data": {"mime_type": mime_type, "data": base64_data}}
glm_content["parts"].append(blob)
+ elif c.type == PromptMessageContentType.DOCUMENT:
+ message_content = cast(DocumentPromptMessageContent, c)
+ if message_content.mime_type not in GOOGLE_AVAILABLE_MIMETYPE:
+ raise ValueError(f"Unsupported mime type {message_content.mime_type}")
+ blob = {"inline_data": {"mime_type": message_content.mime_type, "data": message_content.data}}
+ glm_content["parts"].append(blob)
return glm_content
elif isinstance(message, AssistantPromptMessage):
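The new `DOCUMENT` branch follows Google's file-input API: validate the MIME type against the allow-list, then inline the base64 payload as an `inline_data` part. A condensed, runnable sketch of the same shape (allow-list abridged; the helper name is hypothetical):

```python
import base64

GOOGLE_AVAILABLE_MIMETYPE = ["application/pdf", "text/plain", "text/md"]  # abridged


def build_document_part(mime_type: str, raw: bytes) -> dict:
    # Mirrors the new DOCUMENT branch: reject unsupported types up front,
    # then wrap base64 data in Gemini's inline_data part format.
    if mime_type not in GOOGLE_AVAILABLE_MIMETYPE:
        raise ValueError(f"Unsupported mime type {mime_type}")
    return {"inline_data": {"mime_type": mime_type, "data": base64.b64encode(raw).decode()}}


part = build_document_part("application/pdf", b"%PDF-1.4 ...")
```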
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max-0809.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max-0809.yaml
index 50e10226a5..94b6666d05 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max-0809.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max-0809.yaml
@@ -6,6 +6,7 @@ model_type: llm
features:
- vision
- agent-thought
+ - video
model_properties:
mode: chat
context_size: 32000
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml
index 21b127f56c..b6172c1cbc 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml
@@ -6,6 +6,7 @@ model_type: llm
features:
- vision
- agent-thought
+ - video
model_properties:
mode: chat
context_size: 32000
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus-0809.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus-0809.yaml
index 67b2b2ebdd..0be4b68f4f 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus-0809.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus-0809.yaml
@@ -6,6 +6,7 @@ model_type: llm
features:
- vision
- agent-thought
+ - video
model_properties:
mode: chat
context_size: 32768
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml
index f55764c6c0..6c8a8121c6 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml
@@ -6,6 +6,7 @@ model_type: llm
features:
- vision
- agent-thought
+ - video
model_properties:
mode: chat
context_size: 8000
diff --git a/api/core/model_runtime/model_providers/zhipuai/llm/glm_4v_plus.yaml b/api/core/model_runtime/model_providers/zhipuai/llm/glm_4v_plus.yaml
index 91550ceee8..dbda18b888 100644
--- a/api/core/model_runtime/model_providers/zhipuai/llm/glm_4v_plus.yaml
+++ b/api/core/model_runtime/model_providers/zhipuai/llm/glm_4v_plus.yaml
@@ -6,6 +6,7 @@ model_properties:
mode: chat
features:
- vision
+ - video
parameter_rules:
- name: temperature
use_template: temperature
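These YAML `features` entries (`- video` here and for the Tongyi models, `- document` for the Gemini models above) are what the workflow LLM node consults before letting a file reach the model. A simplified sketch of that gate, assuming enum values mirror the YAML strings:

```python
from enum import Enum


class ModelFeature(str, Enum):
    VISION = "vision"
    VIDEO = "video"
    DOCUMENT = "document"
    AUDIO = "audio"


def file_type_allowed(content_type: str, features: list[ModelFeature]) -> bool:
    # A file passes only if the model schema declares the matching feature;
    # image files require the "vision" feature.
    required = {
        "image": ModelFeature.VISION,
        "video": ModelFeature.VIDEO,
        "document": ModelFeature.DOCUMENT,
        "audio": ModelFeature.AUDIO,
    }[content_type]
    return required in features


assert file_type_allowed("video", [ModelFeature.VISION, ModelFeature.VIDEO])
assert not file_type_allowed("document", [ModelFeature.VISION])
```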
diff --git a/api/core/workflow/nodes/llm/exc.py b/api/core/workflow/nodes/llm/exc.py
index b5207d5573..6599221691 100644
--- a/api/core/workflow/nodes/llm/exc.py
+++ b/api/core/workflow/nodes/llm/exc.py
@@ -26,9 +26,19 @@ class NoPromptFoundError(LLMNodeError):
    """Raised when no prompt is found in the LLM configuration."""


-class NotSupportedPromptTypeError(LLMNodeError):
-    """Raised when the prompt type is not supported."""
+class TemplateTypeNotSupportError(LLMNodeError):
+    """Raised when the prompt template type is not supported."""
+
+    def __init__(self, *, type_name: str):
+        super().__init__(f"Prompt type {type_name} is not supported.")


class MemoryRolePrefixRequiredError(LLMNodeError):
    """Raised when memory role prefix is required for completion model."""
+
+
+class FileTypeNotSupportError(LLMNodeError):
+    """Raised when the file type is not supported by the model."""
+
+    def __init__(self, *, type_name: str):
+        super().__init__(f"{type_name} type is not supported by this model")
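Both replacement exceptions now assemble their message in `__init__`, so call sites pass just the offending type name. A self-contained usage sketch with a stub base class:

```python
class LLMNodeError(Exception):
    """Stub standing in for the base class in exc.py."""


class FileTypeNotSupportError(LLMNodeError):
    def __init__(self, *, type_name: str):
        super().__init__(f"{type_name} type is not supported by this model")


try:
    raise FileTypeNotSupportError(type_name="video")
except LLMNodeError as exc:
    assert str(exc) == "video type is not supported by this model"
```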
diff --git a/api/core/workflow/nodes/llm/node.py b/api/core/workflow/nodes/llm/node.py
index 0cb53ee9d3..2529c76942 100644
--- a/api/core/workflow/nodes/llm/node.py
+++ b/api/core/workflow/nodes/llm/node.py
@@ -65,6 +65,7 @@ from .entities import (
ModelConfig,
)
from .exc import (
+ FileTypeNotSupportError,
InvalidContextStructureError,
InvalidVariableTypeError,
LLMModeRequiredError,
@@ -72,7 +73,7 @@ from .exc import (
MemoryRolePrefixRequiredError,
ModelNotExistError,
NoPromptFoundError,
- NotSupportedPromptTypeError,
+ TemplateTypeNotSupportError,
VariableNotFoundError,
)
@@ -621,9 +622,7 @@ class LLMNode(BaseNode[LLMNodeData]):
prompt_content = prompt_messages[0].content.replace("#sys.query#", user_query)
prompt_messages[0].content = prompt_content
else:
- errmsg = f"Prompt type {type(prompt_template)} is not supported"
- logger.warning(errmsg)
- raise NotSupportedPromptTypeError(errmsg)
+ raise TemplateTypeNotSupportError(type_name=str(type(prompt_template)))
if vision_enabled and user_files:
file_prompts = []
@@ -671,7 +670,7 @@ class LLMNode(BaseNode[LLMNodeData]):
and ModelFeature.AUDIO not in model_config.model_schema.features
)
):
- continue
+ raise FileTypeNotSupportError(type_name=content_item.type)
prompt_message_content.append(content_item)
if len(prompt_message_content) == 1 and prompt_message_content[0].type == PromptMessageContentType.TEXT:
prompt_message.content = prompt_message_content[0].data
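The substantive change in this hunk: a file whose type the model's feature list cannot accept used to be dropped with `continue`; it now raises, surfacing the misconfiguration instead of silently producing a prompt without the file. A condensed sketch of the new control flow:

```python
class FileTypeNotSupportError(Exception):
    def __init__(self, *, type_name: str):
        super().__init__(f"{type_name} type is not supported by this model")


def keep_supported(content_items: list[tuple[str, str]], features: set[str]) -> list[tuple[str, str]]:
    # Old behavior: unsupported items were skipped with `continue`; new
    # behavior: fail fast so the caller learns why the file never arrived.
    kept = []
    for content_type, required_feature in content_items:
        if required_feature not in features:
            raise FileTypeNotSupportError(type_name=content_type)
        kept.append((content_type, required_feature))
    return kept


assert keep_supported([("image", "vision")], {"vision"}) == [("image", "vision")]
```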
diff --git a/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py b/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
index a1f9ece0d1..9a24d35a1f 100644
--- a/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
+++ b/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
@@ -400,59 +400,6 @@ def test_fetch_prompt_messages__basic(faker, llm_node, model_config):
)
},
),
- LLMNodeTestScenario(
- description="Prompt template with variable selector of File without vision feature",
- user_query=fake_query,
- user_files=[],
- vision_enabled=True,
- vision_detail=fake_vision_detail,
- features=[],
- window_size=fake_window_size,
- prompt_template=[
- LLMNodeChatModelMessage(
- text="{{#input.image#}}",
- role=PromptMessageRole.USER,
- edition_type="basic",
- ),
- ],
- expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
- file_variables={
- "input.image": File(
- tenant_id="test",
- type=FileType.IMAGE,
- filename="test1.jpg",
- transfer_method=FileTransferMethod.REMOTE_URL,
- remote_url=fake_remote_url,
- )
- },
- ),
- LLMNodeTestScenario(
- description="Prompt template with variable selector of File with video file and vision feature",
- user_query=fake_query,
- user_files=[],
- vision_enabled=True,
- vision_detail=fake_vision_detail,
- features=[ModelFeature.VISION],
- window_size=fake_window_size,
- prompt_template=[
- LLMNodeChatModelMessage(
- text="{{#input.image#}}",
- role=PromptMessageRole.USER,
- edition_type="basic",
- ),
- ],
- expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
- file_variables={
- "input.image": File(
- tenant_id="test",
- type=FileType.VIDEO,
- filename="test1.mp4",
- transfer_method=FileTransferMethod.REMOTE_URL,
- remote_url=fake_remote_url,
- extension="mp4",
- )
- },
- ),
]
for scenario in test_scenarios:
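The two deleted scenarios pinned down the old silent-skip behavior (a file quietly omitted when the model lacked the matching feature). Under the new semantics those inputs raise instead, so a replacement test would assert the exception. A hedged sketch, with fixture and method names illustrative rather than taken from this diff:

```python
import pytest

from core.workflow.nodes.llm.exc import FileTypeNotSupportError


def test_file_without_matching_feature_raises(llm_node, image_file_variable):
    # Hypothetical replacement for the removed scenarios: the node should
    # now raise rather than return a prompt with the file dropped.
    with pytest.raises(FileTypeNotSupportError):
        llm_node._fetch_prompt_messages(user_files=[image_file_variable])
```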
diff --git a/web/app/components/app/configuration/config-vision/index.tsx b/web/app/components/app/configuration/config-vision/index.tsx
index 23f00d46d8..f30d3e4a0a 100644
--- a/web/app/components/app/configuration/config-vision/index.tsx
+++ b/web/app/components/app/configuration/config-vision/index.tsx
@@ -12,34 +12,46 @@ import ConfigContext from '@/context/debug-configuration'
// import { Resolution } from '@/types/app'
import { useFeatures, useFeaturesStore } from '@/app/components/base/features/hooks'
import Switch from '@/app/components/base/switch'
-import type { FileUpload } from '@/app/components/base/features/types'
+import { SupportUploadFileTypes } from '@/app/components/workflow/types'
const ConfigVision: FC = () => {
const { t } = useTranslation()
- const { isShowVisionConfig } = useContext(ConfigContext)
+ const { isShowVisionConfig, isAllowVideoUpload } = useContext(ConfigContext)
const file = useFeatures(s => s.features.file)
const featuresStore = useFeaturesStore()
- const handleChange = useCallback((data: FileUpload) => {
+ const isImageEnabled = file?.allowed_file_types?.includes(SupportUploadFileTypes.image) ?? false
+
+ const handleChange = useCallback((value: boolean) => {
const {
features,
setFeatures,
} = featuresStore!.getState()
const newFeatures = produce(features, (draft) => {
- draft.file = {
- ...draft.file,
- enabled: data.enabled,
- image: {
- enabled: data.enabled,
- detail: data.image?.detail,
- transfer_methods: data.image?.transfer_methods,
- number_limits: data.image?.number_limits,
- },
+ if (value) {
+ draft.file!.allowed_file_types = Array.from(new Set([
+ ...(draft.file?.allowed_file_types || []),
+ SupportUploadFileTypes.image,
+ ...(isAllowVideoUpload ? [SupportUploadFileTypes.video] : []),
+ ]))
+ }
+ else {
+ draft.file!.allowed_file_types = draft.file!.allowed_file_types?.filter(
+ type => type !== SupportUploadFileTypes.image && (isAllowVideoUpload ? type !== SupportUploadFileTypes.video : true),
+ )
+ }
+
+ if (draft.file) {
+ draft.file.enabled = (draft.file.allowed_file_types?.length ?? 0) > 0
+ draft.file.image = {
+ ...(draft.file.image || {}),
+ enabled: value,
+ }
}
})
setFeatures(newFeatures)
- }, [featuresStore])
+ }, [featuresStore, isAllowVideoUpload])
if (!isShowVisionConfig)
return null
@@ -89,11 +101,8 @@ const ConfigVision: FC = () => {