Specify img2text model by tag (#5063)

### What problem does this PR solve?

The current design is not well suited for multimodal models: each model can be configured for only a single purpose, either chat or Img2txt. To work around this limitation, we currently register model aliases such as gpt-4o-mini and gpt-4o-mini-2024-07-18.

This PR removes that workaround by allowing the Img2txt model to be selected by its tag instead of its model_type.
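
The core of the change is the selection rule: a model is offered as an Img2txt option when either its model_type is image2text or its tags contain IMAGE2TEXT. A minimal sketch of that rule, using an illustrative `LlmEntry` type in place of the project's real `IThirdOAIModelCollection` entries:

```ts
// Sketch of the tag-based selection rule (the LlmEntry type is illustrative,
// not the project's real one): a model qualifies as an Img2txt option when
// its model_type is "image2text" OR its comma-separated tags include "IMAGE2TEXT".
interface LlmEntry {
  llm_name: string;
  model_type: string; // e.g. "chat"
  tags?: string; // e.g. "LLM,CHAT,128K,IMAGE2TEXT"
  available: boolean;
}

const isImage2TextOption = (x: LlmEntry): boolean =>
  (x.model_type.includes('image2text') ||
    (x.tags ?? '').includes('IMAGE2TEXT')) &&
  x.available;

// With the updated config, gpt-4o-mini passes via its IMAGE2TEXT tag even
// though its model_type stays "chat", so the dated image2text aliases
// become unnecessary.
```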

### Type of change
- [x] Refactoring
Authored by petertc on 2025-02-18 11:14:48 +08:00, committed by GitHub
commit 4694604836 (parent 224c5472c8)
2 changed files with 38 additions and 15 deletions

In the LLM factory configuration, the gpt-4o family gains the IMAGE2TEXT tag and the dedicated image2text aliases are removed:

```diff
@@ -8,13 +8,13 @@
   "llm": [
     {
       "llm_name": "gpt-4o-mini",
-      "tags": "LLM,CHAT,128K",
+      "tags": "LLM,CHAT,128K,IMAGE2TEXT",
       "max_tokens": 128000,
       "model_type": "chat"
     },
     {
       "llm_name": "gpt-4o",
-      "tags": "LLM,CHAT,128K",
+      "tags": "LLM,CHAT,128K,IMAGE2TEXT",
       "max_tokens": 128000,
       "model_type": "chat"
     },
@@ -72,18 +72,6 @@
       "max_tokens": 32768,
       "model_type": "chat"
     },
-    {
-      "llm_name": "gpt-4o-2024-08-06",
-      "tags": "LLM,CHAT,IMAGE2TEXT",
-      "max_tokens": 128000,
-      "model_type": "image2text"
-    },
-    {
-      "llm_name": "gpt-4o-mini-2024-07-18",
-      "tags": "LLM,CHAT,IMAGE2TEXT",
-      "max_tokens": 128000,
-      "model_type": "image2text"
-    },
     {
       "llm_name": "tts-1",
       "tags": "TTS",
```

In the web hooks, a new groupImage2TextOptions helper selects Img2txt options by model_type or tag, and the options map for LlmModelType.Image2text now uses it:

```diff
@@ -58,6 +58,41 @@ export const useSelectLlmOptions = () => {
 export const useSelectLlmOptionsByModelType = () => {
   const llmInfo: IThirdOAIModelCollection = useFetchLlmList();
 
+  const groupImage2TextOptions = () => {
+    const modelType = LlmModelType.Image2text;
+    const modelTag = modelType.toUpperCase();
+    return Object.entries(llmInfo)
+      .map(([key, value]) => {
+        return {
+          label: key,
+          options: value
+            .filter(
+              (x) =>
+                (x.model_type.includes(modelType) ||
+                  (x.tags && x.tags.includes(modelTag))) &&
+                x.available,
+            )
+            .map((x) => ({
+              label: (
+                <Flex align="center" gap={6}>
+                  <LlmIcon
+                    name={getLLMIconName(x.fid, x.llm_name)}
+                    width={26}
+                    height={26}
+                    size={'small'}
+                  />
+                  <span>{x.llm_name}</span>
+                </Flex>
+              ),
+              value: `${x.llm_name}@${x.fid}`,
+              disabled: !x.available,
+            })),
+        };
+      })
+      .filter((x) => x.options.length > 0);
+  };
+
   const groupOptionsByModelType = (modelType: LlmModelType) => {
     return Object.entries(llmInfo)
       .filter(([, value]) =>
@@ -95,7 +130,7 @@ export const useSelectLlmOptionsByModelType = () => {
   return {
     [LlmModelType.Chat]: groupOptionsByModelType(LlmModelType.Chat),
     [LlmModelType.Embedding]: groupOptionsByModelType(LlmModelType.Embedding),
-    [LlmModelType.Image2text]: groupOptionsByModelType(LlmModelType.Image2text),
+    [LlmModelType.Image2text]: groupImage2TextOptions(),
     [LlmModelType.Speech2text]: groupOptionsByModelType(
       LlmModelType.Speech2text,
     ),
```
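
For reference, with the configuration change above, the grouped result returned for the Img2txt picker would look roughly like this (the factory name `OpenAI` is assumed for illustration, and the JSX icon label is reduced to a plain string):

```ts
// Approximate shape of groupImage2TextOptions() output after this change;
// the "OpenAI" factory key is an assumption and the <Flex>/<LlmIcon> label
// is simplified to a string for readability.
const image2TextOptions = [
  {
    label: 'OpenAI',
    options: [
      { label: 'gpt-4o-mini', value: 'gpt-4o-mini@OpenAI', disabled: false },
      { label: 'gpt-4o', value: 'gpt-4o@OpenAI', disabled: false },
    ],
  },
];
```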