diff --git a/conf/llm_factories.json b/conf/llm_factories.json
index f0aa28cf7..586b81f06 100644
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@@ -1920,7 +1920,7 @@
                 {
                     "llm_name": "step-1v-8k",
                     "tags": "LLM,CHAT,IMAGE2TEXT",
-                    "max_tokens": 8000,
+                    "max_tokens": 8192,
                     "model_type": "image2text"
                 }
             ]
diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py
index a91c18ec9..3ebb230e2 100644
--- a/rag/llm/__init__.py
+++ b/rag/llm/__init__.py
@@ -52,7 +52,8 @@ CvModel = {
     "OpenRouter": OpenRouterCV,
     "LocalAI": LocalAICV,
     "NVIDIA": NvidiaCV,
-    "LM-Studio": LmStudioCV
+    "LM-Studio": LmStudioCV,
+    "StepFun": StepFunCV
 }
 
 
diff --git a/rag/llm/cv_model.py b/rag/llm/cv_model.py
index 742eb9e64..baf93fd89 100644
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@@ -622,6 +622,26 @@ class NvidiaCV(Base):
         }
     ]
 
+class StepFunCV(Base):
+    def __init__(self, key, model_name="step-1v-8k", lang="Chinese", base_url="https://api.stepfun.com/v1"):
+        if not base_url: base_url = "https://api.stepfun.com/v1"
+        self.client = OpenAI(api_key=key, base_url=base_url)
+        self.model_name = model_name
+        self.lang = lang
+
+    def describe(self, image, max_tokens=4096):
+        b64 = self.image2base64(image)
+        prompt = self.prompt(b64)
+        for i in range(len(prompt)):
+            for c in prompt[i]["content"]:
+                if "text" in c: c["type"] = "text"
+
+        res = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=prompt,
+            max_tokens=max_tokens,
+        )
+        return res.choices[0].message.content.strip(), res.usage.total_tokens
 
 class LmStudioCV(GptV4):
     def __init__(self, key, model_name, base_url, lang="Chinese"):
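
A minimal usage sketch (not part of the diff): it assumes the CvModel registry exported from rag/llm/__init__.py, a placeholder API key and image file, and that describe() accepts raw image bytes the way the other CV wrappers in cv_model.py are used.

# Usage sketch only -- the key "sk-..." and file "sample.png" are placeholders,
# and passing raw image bytes to describe() is an assumption based on how the
# other CV models in rag/llm/cv_model.py are invoked.
from rag.llm import CvModel

# "StepFun" is the registry key added in rag/llm/__init__.py above.
cv_mdl = CvModel["StepFun"](key="sk-...", model_name="step-1v-8k", lang="Chinese")

with open("sample.png", "rb") as f:
    image_bytes = f.read()

# describe() returns the generated caption plus the total token count
# reported by the OpenAI-compatible StepFun endpoint.
description, used_tokens = cv_mdl.describe(image_bytes, max_tokens=1024)
print(used_tokens, description)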