Specify img2text model by tag (#5063)

### What problem does this PR solve?

The current design is not well suited for multimodal models: each model can be configured for only a single purpose, either chat or Img2txt. To work around this limitation, we currently register model aliases such as gpt-4o-mini and gpt-4o-mini-2024-07-18.

This PR removes that workaround by allowing the Img2txt model to be selected by its tag instead of its model_type.
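
The core of the change is the selection rule: a model is offered as an Img2txt option when either its model_type is image2text or its tags contain IMAGE2TEXT. A minimal sketch of that rule, using an illustrative `LlmEntry` type in place of the project's real `IThirdOAIModelCollection` entries:

```ts
// Sketch of the tag-based selection rule (the LlmEntry type is illustrative,
// not the project's real one): a model qualifies as an Img2txt option when
// its model_type is "image2text" OR its comma-separated tags include "IMAGE2TEXT".
interface LlmEntry {
  llm_name: string;
  model_type: string; // e.g. "chat"
  tags?: string; // e.g. "LLM,CHAT,128K,IMAGE2TEXT"
  available: boolean;
}

const isImage2TextOption = (x: LlmEntry): boolean =>
  (x.model_type.includes('image2text') ||
    (x.tags ?? '').includes('IMAGE2TEXT')) &&
  x.available;

// With the updated config, gpt-4o-mini passes via its IMAGE2TEXT tag even
// though its model_type stays "chat", so the dated image2text aliases
// become unnecessary.
```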

### Type of change
- [x] Refactoring
Authored by petertc on 2025-02-18 11:14:48 +08:00, committed by GitHub
commit 4694604836 (parent 224c5472c8)
2 changed files with 38 additions and 15 deletions

In the LLM factory configuration, the gpt-4o family gains the IMAGE2TEXT tag and the dedicated image2text aliases are removed:

```diff
@@ -8,13 +8,13 @@
   "llm": [
     {
       "llm_name": "gpt-4o-mini",
-      "tags": "LLM,CHAT,128K",
+      "tags": "LLM,CHAT,128K,IMAGE2TEXT",
       "max_tokens": 128000,
       "model_type": "chat"
     },
     {
       "llm_name": "gpt-4o",
-      "tags": "LLM,CHAT,128K",
+      "tags": "LLM,CHAT,128K,IMAGE2TEXT",
       "max_tokens": 128000,
       "model_type": "chat"
     },
@@ -72,18 +72,6 @@
       "max_tokens": 32768,
       "model_type": "chat"
     },
-    {
-      "llm_name": "gpt-4o-2024-08-06",
-      "tags": "LLM,CHAT,IMAGE2TEXT",
-      "max_tokens": 128000,
-      "model_type": "image2text"
-    },
-    {
-      "llm_name": "gpt-4o-mini-2024-07-18",
-      "tags": "LLM,CHAT,IMAGE2TEXT",
-      "max_tokens": 128000,
-      "model_type": "image2text"
-    },
     {
       "llm_name": "tts-1",
       "tags": "TTS",
```

In the web hooks, a new groupImage2TextOptions helper selects Img2txt options by model_type or tag, and the options map for LlmModelType.Image2text now uses it:

```diff
@@ -58,6 +58,41 @@ export const useSelectLlmOptions = () => {
 export const useSelectLlmOptionsByModelType = () => {
   const llmInfo: IThirdOAIModelCollection = useFetchLlmList();
 
+  const groupImage2TextOptions = () => {
+    const modelType = LlmModelType.Image2text;
+    const modelTag = modelType.toUpperCase();
+    return Object.entries(llmInfo)
+      .map(([key, value]) => {
+        return {
+          label: key,
+          options: value
+            .filter(
+              (x) =>
+                (x.model_type.includes(modelType) ||
+                  (x.tags && x.tags.includes(modelTag))) &&
+                x.available,
+            )
+            .map((x) => ({
+              label: (
+                <Flex align="center" gap={6}>
+                  <LlmIcon
+                    name={getLLMIconName(x.fid, x.llm_name)}
+                    width={26}
+                    height={26}
+                    size={'small'}
+                  />
+                  <span>{x.llm_name}</span>
+                </Flex>
+              ),
+              value: `${x.llm_name}@${x.fid}`,
+              disabled: !x.available,
+            })),
+        };
+      })
+      .filter((x) => x.options.length > 0);
+  };
+
   const groupOptionsByModelType = (modelType: LlmModelType) => {
     return Object.entries(llmInfo)
       .filter(([, value]) =>
@@ -95,7 +130,7 @@ export const useSelectLlmOptionsByModelType = () => {
   return {
     [LlmModelType.Chat]: groupOptionsByModelType(LlmModelType.Chat),
     [LlmModelType.Embedding]: groupOptionsByModelType(LlmModelType.Embedding),
-    [LlmModelType.Image2text]: groupOptionsByModelType(LlmModelType.Image2text),
+    [LlmModelType.Image2text]: groupImage2TextOptions(),
     [LlmModelType.Speech2text]: groupOptionsByModelType(
       LlmModelType.Speech2text,
     ),
```
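
For reference, with the configuration change above, the grouped result returned for the Img2txt picker would look roughly like this (the factory name `OpenAI` is assumed for illustration, and the JSX icon label is reduced to a plain string):

```ts
// Approximate shape of groupImage2TextOptions() output after this change;
// the "OpenAI" factory key is an assumption and the <Flex>/<LlmIcon> label
// is simplified to a string for readability.
const image2TextOptions = [
  {
    label: 'OpenAI',
    options: [
      { label: 'gpt-4o-mini', value: 'gpt-4o-mini@OpenAI', disabled: false },
      { label: 'gpt-4o', value: 'gpt-4o@OpenAI', disabled: false },
    ],
  },
];
```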