# Specify img2text model by tag (#5063)
### What problem does this PR solve?

The current design is not well-suited for multimodal models: each model entry can be configured for only a single purpose, either chat or img2txt. To work around this limitation, we registered model aliases such as `gpt-4o-mini` and `gpt-4o-mini-2024-07-18`, one entry per purpose. This PR fixes that by allowing the img2txt model to be specified by tag instead of by `model_type`.

### Type of change

- [x] Refactoring
Parent: `224c5472c8` · Commit: `4694604836`
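In practice, the workaround meant registering the same underlying model twice under an alias, once per capability; with tags, a single entry advertises every capability. A minimal before/after sketch of the two entry shapes (TypeScript object literals, illustrative data only):

```typescript
// Before: one logical model split across two entries, keyed by model_type
// (the dated name is an alias used only to expose the image2text side).
const before = [
  { llm_name: 'gpt-4o-mini', tags: 'LLM,CHAT,128K', model_type: 'chat' },
  {
    llm_name: 'gpt-4o-mini-2024-07-18',
    tags: 'LLM,CHAT,IMAGE2TEXT',
    model_type: 'image2text',
  },
];

// After: a single entry whose comma-separated tags advertise both capabilities.
const after = [
  { llm_name: 'gpt-4o-mini', tags: 'LLM,CHAT,128K,IMAGE2TEXT', model_type: 'chat' },
];
```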
In the OpenAI section of the LLM factory configuration, the `gpt-4o-mini` and `gpt-4o` chat entries gain the `IMAGE2TEXT` tag, and the dated alias entries (`gpt-4o-2024-08-06`, `gpt-4o-mini-2024-07-18`) that existed only to expose the same models as `image2text` are removed:

```diff
@@ -8,13 +8,13 @@
         "llm": [
             {
                 "llm_name": "gpt-4o-mini",
-                "tags": "LLM,CHAT,128K",
+                "tags": "LLM,CHAT,128K,IMAGE2TEXT",
                 "max_tokens": 128000,
                 "model_type": "chat"
             },
             {
                 "llm_name": "gpt-4o",
-                "tags": "LLM,CHAT,128K",
+                "tags": "LLM,CHAT,128K,IMAGE2TEXT",
                 "max_tokens": 128000,
                 "model_type": "chat"
             },
@@ -72,18 +72,6 @@
                 "max_tokens": 32768,
                 "model_type": "chat"
             },
-            {
-                "llm_name": "gpt-4o-2024-08-06",
-                "tags": "LLM,CHAT,IMAGE2TEXT",
-                "max_tokens": 128000,
-                "model_type": "image2text"
-            },
-            {
-                "llm_name": "gpt-4o-mini-2024-07-18",
-                "tags": "LLM,CHAT,IMAGE2TEXT",
-                "max_tokens": 128000,
-                "model_type": "image2text"
-            },
             {
                 "llm_name": "tts-1",
                 "tags": "TTS",
```
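With tags carrying capabilities, a consumer can ask "can this entry do image-to-text?" without a dedicated `image2text` entry existing. A minimal sketch of that check, with hypothetical names (this helper is not code from this PR):

```typescript
// Shape of one entry from the factory configuration, reduced to the
// fields this check reads (illustrative, not ragflow's full type).
interface LlmEntry {
  llm_name: string;
  model_type: string;
  tags: string; // comma-separated, e.g. "LLM,CHAT,128K,IMAGE2TEXT"
}

// Hypothetical helper: an entry qualifies for img2txt either by its
// model_type or by carrying the IMAGE2TEXT tag.
function supportsImage2Text(entry: LlmEntry): boolean {
  return (
    entry.model_type === 'image2text' ||
    entry.tags.split(',').map((t) => t.trim()).includes('IMAGE2TEXT')
  );
}
```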
In the frontend hook that builds the model-selector options, a new `groupImage2TextOptions` helper selects img2txt candidates by matching either the model's `model_type` or its `IMAGE2TEXT` tag, and the hook's return value now uses it for the `Image2text` group:

```diff
@@ -58,6 +58,41 @@ export const useSelectLlmOptions = () => {
 export const useSelectLlmOptionsByModelType = () => {
   const llmInfo: IThirdOAIModelCollection = useFetchLlmList();
 
+  const groupImage2TextOptions = () => {
+    const modelType = LlmModelType.Image2text;
+    const modelTag = modelType.toUpperCase();
+
+    return Object.entries(llmInfo)
+      .map(([key, value]) => {
+        return {
+          label: key,
+          options: value
+            .filter(
+              (x) =>
+                (x.model_type.includes(modelType) ||
+                  (x.tags && x.tags.includes(modelTag))) &&
+                x.available,
+            )
+            .map((x) => ({
+              label: (
+                <Flex align="center" gap={6}>
+                  <LlmIcon
+                    name={getLLMIconName(x.fid, x.llm_name)}
+                    width={26}
+                    height={26}
+                    size={'small'}
+                  />
+                  <span>{x.llm_name}</span>
+                </Flex>
+              ),
+              value: `${x.llm_name}@${x.fid}`,
+              disabled: !x.available,
+            })),
+        };
+      })
+      .filter((x) => x.options.length > 0);
+  };
+
   const groupOptionsByModelType = (modelType: LlmModelType) => {
     return Object.entries(llmInfo)
       .filter(([, value]) =>
@@ -95,7 +130,7 @@ export const useSelectLlmOptionsByModelType = () => {
   return {
     [LlmModelType.Chat]: groupOptionsByModelType(LlmModelType.Chat),
     [LlmModelType.Embedding]: groupOptionsByModelType(LlmModelType.Embedding),
-    [LlmModelType.Image2text]: groupOptionsByModelType(LlmModelType.Image2text),
+    [LlmModelType.Image2text]: groupImage2TextOptions(),
     [LlmModelType.Speech2text]: groupOptionsByModelType(
       LlmModelType.Speech2text,
     ),
```
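A quick way to sanity-check the new selector is to run the same predicate over hand-made data. The snippet below mirrors the filter inside `groupImage2TextOptions`, with the entry type reduced to the fields the filter reads (sample data is made up):

```typescript
type Model = {
  llm_name: string;
  model_type: string;
  tags?: string;
  available: boolean;
};

const modelType = 'image2text';
const modelTag = modelType.toUpperCase(); // "IMAGE2TEXT"

const models: Model[] = [
  { llm_name: 'gpt-4o-mini', model_type: 'chat', tags: 'LLM,CHAT,128K,IMAGE2TEXT', available: true },
  { llm_name: 'gpt-4o', model_type: 'chat', tags: 'LLM,CHAT,128K,IMAGE2TEXT', available: false },
  { llm_name: 'text-embedding-3-small', model_type: 'embedding', available: true },
];

// Same predicate as the hook: match by model_type or by tag, and
// keep only models marked available.
const img2txt = models.filter(
  (x) =>
    (x.model_type.includes(modelType) || (x.tags && x.tags.includes(modelTag))) &&
    x.available,
);

console.log(img2txt.map((x) => x.llm_name)); // ["gpt-4o-mini"]
```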