From b4a281eca168eedfc842bb94c8429ecd63efadb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=84=E8=85=BE?= <101850389+hangters@users.noreply.github.com>
Date: Tue, 23 Jul 2024 10:43:09 +0800
Subject: [PATCH] add support for NVIDIA llm (#1645)

### What problem does this PR solve?

Add support for NVIDIA LLMs.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen
---
 conf/llm_factories.json                            | 284 ++++++++++++++++++
 rag/llm/__init__.py                                |  12 +-
 rag/llm/chat_model.py                              |  78 ++++-
 rag/llm/cv_model.py                                |  63 +++-
 rag/llm/embedding_model.py                         |  38 +++
 rag/llm/rerank_model.py                            |  38 +++
 web/src/assets/svg/llm/nvidia.svg                  |   1 +
 .../user-setting/setting-model/constant.ts         |   1 +
 8 files changed, 508 insertions(+), 7 deletions(-)
 create mode 100644 web/src/assets/svg/llm/nvidia.svg

diff --git a/conf/llm_factories.json b/conf/llm_factories.json
index 5eb125542..402577a3d 100644
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@@ -1918,6 +1918,290 @@
                     "model_type": "chat"
                 }
             ]
+        },
+        {
+            "name": "NVIDIA",
+            "logo": "",
+            "tags": "LLM,TEXT EMBEDDING,TEXT RE-RANK",
+            "status": "1",
+            "llm": [
+                {
+                    "llm_name": "nvidia/nemotron-4-340b-reward",
+                    "tags": "LLM,CHAT,4K",
+                    "max_tokens": 4096,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "aisingapore/sea-lion-7b-instruct",
+                    "tags": "LLM,CHAT,4K",
+                    "max_tokens": 4096,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "databricks/dbrx-instruct",
+                    "tags": "LLM,CHAT,16K",
+                    "max_tokens": 16384,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "google/gemma-7b",
+                    "tags": "LLM,CHAT,32K",
+                    "max_tokens": 32768,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "google/gemma-2b",
+                    "tags": "LLM,CHAT,16K",
+                    "max_tokens": 16384,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "google/gemma-2-9b-it",
+                    "tags": "LLM,CHAT,8K",
+                    "max_tokens": 8192,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "google/gemma-2-27b-it",
+                    "tags": "LLM,CHAT,8K",
+                    "max_tokens": 8192,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "google/recurrentgemma-2b",
+                    "tags": "LLM,CHAT,4K",
+                    "max_tokens": 4096,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "mediatek/breeze-7b-instruct",
+                    "tags": "LLM,CHAT,8K",
+                    "max_tokens": 8192,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "meta/llama2-70b",
+                    "tags": "LLM,CHAT,4K",
+                    "max_tokens": 4096,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "meta/llama3-8b",
+                    "tags": "LLM,CHAT,8K",
+                    "max_tokens": 8192,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "meta/llama3-70b",
+                    "tags": "LLM,CHAT,8K",
+                    "max_tokens": 8192,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "microsoft/phi-3-medium-128k-instruct",
+                    "tags": "LLM,CHAT,128K",
+                    "max_tokens": 131072,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "microsoft/phi-3-medium-4k-instruct",
+                    "tags": "LLM,CHAT,4K",
+                    "max_tokens": 4096,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "microsoft/phi-3-mini-128k-instruct",
+                    "tags": "LLM,CHAT,128K",
+                    "max_tokens": 131072,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "microsoft/phi-3-mini-4k-instruct",
+                    "tags": "LLM,CHAT,4K",
+                    "max_tokens": 4096,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "microsoft/phi-3-small-128k-instruct",
+                    "tags": "LLM,CHAT,128K",
+                    "max_tokens": 131072,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "microsoft/phi-3-small-8k-instruct",
+                    "tags": "LLM,CHAT,8K",
+                    "max_tokens": 8192,
+                    "model_type": "chat"
+                },
+                {
+                    "llm_name": "mistralai/mistral-7b-instruct",
+                    "tags": "LLM,CHAT,4K",
+                    "max_tokens": 4096,
+                    "model_type": "chat"
+                },
"mistralai/mistral-7b-instruct-v0.3", + "tags": "LLM,CHAT,4K", + "max_tokens": 4096, + "model_type": "chat" + }, + { + "llm_name": "mistralai/mixtral-8x7b-instruct", + "tags": "LLM,CHAT,32K", + "max_tokens": 32768, + "model_type": "chat" + }, + { + "llm_name": "mistralai/mixtral-8x22b-instruct", + "tags": "LLM,CHAT,64K", + "max_tokens": 65536, + "model_type": "chat" + }, + { + "llm_name": "mistralai/mistral-large", + "tags": "LLM,CHAT,32K", + "max_tokens": 32768, + "model_type": "chat" + }, + { + "llm_name": "nv-mistralai/mistral-nemo-12b-instruct", + "tags": "LLM,CHAT,128K", + "max_tokens": 131072, + "model_type": "chat" + }, + { + "llm_name": "nvidia/llama3-chatqa-1.5-70b", + "tags": "LLM,CHAT,4K", + "max_tokens": 4096, + "model_type": "chat" + }, + { + "llm_name": "nvidia/llama3-chatqa-1.5-8b", + "tags": "LLM,CHAT,4K", + "max_tokens": 4096, + "model_type": "chat" + }, + { + "llm_name": "nvidia/nemotron-4-340b-instruct", + "tags": "LLM,CHAT,4K", + "max_tokens": 4096, + "model_type": "chat" + }, + { + "llm_name": "seallms/seallm-7b-v2.5", + "tags": "LLM,CHAT,4K", + "max_tokens": 4096, + "model_type": "chat" + }, + { + "llm_name": "snowflake/arctic", + "tags": "LLM,CHAT,4K", + "max_tokens": 4096, + "model_type": "chat" + }, + { + "llm_name": "upstage/solar-10.7b-instruct", + "tags": "LLM,CHAT,4K", + "max_tokens": 4096, + "model_type": "chat" + }, + { + "llm_name": "baai/bge-m3", + "tags": "TEXT EMBEDDING,8K", + "max_tokens": 8192, + "model_type": "embedding" + }, + { + "llm_name": "nvidia/embed-qa-4", + "tags": "TEXT EMBEDDING,512", + "max_tokens": 512, + "model_type": "embedding" + }, + { + "llm_name": "nvidia/nv-embed-v1", + "tags": "TEXT EMBEDDING,32K", + "max_tokens": 32768, + "model_type": "embedding" + }, + { + "llm_name": "nvidia/nv-embedqa-e5-v5", + "tags": "TEXT EMBEDDING,512", + "max_tokens": 512, + "model_type": "embedding" + }, + { + "llm_name": "nvidia/nv-embedqa-mistral-7b-v2", + "tags": "TEXT EMBEDDING,512", + "max_tokens": 512, + "model_type": "embedding" + }, + { + "llm_name": "nvidia/nv-rerankqa-mistral-4b-v3", + "tags": "RE-RANK,512", + "max_tokens": 512, + "model_type": "rerank" + }, + { + "llm_name": "nvidia/rerank-qa-mistral-4b", + "tags": "RE-RANK,512", + "max_tokens": 512, + "model_type": "rerank" + }, + { + "llm_name": "snowflake/arctic-embed-l", + "tags": "TEXT EMBEDDING,512", + "max_tokens": 512, + "model_type": "embedding" + }, + { + "llm_name": "adept/fuyu-8b", + "tags": "LLM,IMAGE2TEXT,4K", + "max_tokens": 4096, + "model_type": "image2text" + }, + { + "llm_name": "google/deplot", + "tags": "LLM,IMAGE2TEXT,4K", + "max_tokens": 4096, + "model_type": "image2text" + }, + { + "llm_name": "google/paligemma", + "tags": "LLM,IMAGE2TEXT,4K", + "max_tokens": 4096, + "model_type": "image2text" + }, + { + "llm_name": "Iiuhaotian/Ilava-v1.6-34b", + "tags": "LLM,IMAGE2TEXT,4K", + "max_tokens": 4096, + "model_type": "image2text" + }, + { + "llm_name": "Iiuhaotian/Ilava-v1.6-mistral-7b", + "tags": "LLM,IMAGE2TEXT,4K", + "max_tokens": 4096, + "model_type": "image2text" + }, + { + "llm_name": "microsoft/kosmos-2", + "tags": "LLM,IMAGE2TEXT,4K", + "max_tokens": 4096, + "model_type": "image2text" + }, + { + "llm_name": "microsoft/phi-3-vision-128k-instruct", + "tags": "LLM,IMAGE2TEXT,128K", + "max_tokens": 131072, + "model_type": "image2text" + }, + { + "llm_name": "nvidia/neva-22b", + "tags": "LLM,IMAGE2TEXT,4K", + "max_tokens": 4096, + "model_type": "image2text" + } + ] } ] } \ No newline at end of file diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py index 
diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py
index 50e2938a3..1ba1e56f8 100644
--- a/rag/llm/__init__.py
+++ b/rag/llm/__init__.py
@@ -34,7 +34,8 @@ EmbeddingModel = {
     "BAAI": DefaultEmbedding,
     "Mistral": MistralEmbed,
     "Bedrock": BedrockEmbed,
-    "Gemini":GeminiEmbed
+    "Gemini":GeminiEmbed,
+    "NVIDIA":NvidiaEmbed
 }
 
 
@@ -48,7 +49,8 @@ CvModel = {
     "Moonshot": LocalCV,
     'Gemini':GeminiCV,
     'OpenRouter':OpenRouterCV,
-    "LocalAI":LocalAICV
+    "LocalAI":LocalAICV,
+    "NVIDIA":NvidiaCV
 }
 
 
@@ -71,7 +73,8 @@ ChatModel = {
     "Bedrock": BedrockChat,
     "Groq": GroqChat,
     'OpenRouter':OpenRouterChat,
-    "StepFun":StepFunChat
+    "StepFun":StepFunChat,
+    "NVIDIA":NvidiaChat
 }
 
 
@@ -79,7 +82,8 @@ RerankModel = {
     "BAAI": DefaultRerank,
     "Jina": JinaRerank,
     "Youdao": YoudaoRerank,
-    "Xinference": XInferenceRerank
+    "Xinference": XInferenceRerank,
+    "NVIDIA":NvidiaRerank
 }
 
diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py
index f52872926..351548719 100644
--- a/rag/llm/chat_model.py
+++ b/rag/llm/chat_model.py
@@ -581,7 +581,6 @@ class MiniMaxChat(Base):
         response = requests.request(
             "POST", url=self.base_url, headers=headers, data=payload
         )
-        print(response, flush=True)
         response = response.json()
         ans = response["choices"][0]["message"]["content"].strip()
         if response["choices"][0]["finish_reason"] == "length":
@@ -902,4 +901,79 @@ class StepFunChat(Base):
     def __init__(self, key, model_name, base_url="https://api.stepfun.com/v1/chat/completions"):
         if not base_url:
             base_url = "https://api.stepfun.com/v1/chat/completions"
-        super().__init__(key, model_name, base_url)
\ No newline at end of file
+        super().__init__(key, model_name, base_url)
+
+
+class NvidiaChat(Base):
+    def __init__(
+        self,
+        key,
+        model_name,
+        base_url="https://integrate.api.nvidia.com/v1/chat/completions",
+    ):
+        if not base_url:
+            base_url = "https://integrate.api.nvidia.com/v1/chat/completions"
+        self.base_url = base_url
+        self.model_name = model_name
+        self.api_key = key
+        self.headers = {
+            "accept": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+    def chat(self, system, history, gen_conf):
+        if system:
+            history.insert(0, {"role": "system", "content": system})
+        for k in list(gen_conf.keys()):
+            if k not in ["temperature", "top_p", "max_tokens"]:
+                del gen_conf[k]
+        payload = {"model": self.model_name, "messages": history, **gen_conf}
+        try:
+            response = requests.post(
+                url=self.base_url, headers=self.headers, json=payload
+            )
+            response = response.json()
+            ans = response["choices"][0]["message"]["content"].strip()
+            return ans, response["usage"]["total_tokens"]
+        except Exception as e:
+            return "**ERROR**: " + str(e), 0
+
+    def chat_streamly(self, system, history, gen_conf):
+        if system:
+            history.insert(0, {"role": "system", "content": system})
+        for k in list(gen_conf.keys()):
+            if k not in ["temperature", "top_p", "max_tokens"]:
+                del gen_conf[k]
+        ans = ""
+        total_tokens = 0
+        payload = {
+            "model": self.model_name,
+            "messages": history,
+            "stream": True,
+            **gen_conf,
+        }
+
+        try:
+            response = requests.post(
+                url=self.base_url,
+                headers=self.headers,
+                json=payload,
+            )
+            for resp in response.text.split("\n\n"):
+                if "choices" not in resp:
+                    continue
+                resp = json.loads(resp[6:])
+                if "content" in resp["choices"][0]["delta"]:
+                    text = resp["choices"][0]["delta"]["content"]
+                else:
+                    continue
+                ans += text
+                if "usage" in resp:
+                    total_tokens = resp["usage"]["total_tokens"]
+                yield ans
+
+        except Exception as e:
+            yield ans + "\n**ERROR**: " + str(e)
+
+        yield total_tokens
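A quick way to exercise the new chat class end to end, sketched under the assumption of a valid API key (the key string and model choice are placeholders):

```python
from rag.llm.chat_model import NvidiaChat

# Placeholder key; "meta/llama3-70b" is one of the chat entries registered
# in conf/llm_factories.json above.
mdl = NvidiaChat(key="nvapi-...", model_name="meta/llama3-70b")

ans, total_tokens = mdl.chat(
    system="You are a helpful assistant.",
    history=[{"role": "user", "content": "Summarize RAG in one sentence."}],
    gen_conf={"temperature": 0.2, "max_tokens": 128},
)
print(ans, total_tokens)

# chat_streamly yields the progressively accumulated answer string,
# then the total token count as its final item.
*chunks, token_count = list(
    mdl.chat_streamly("Be terse.", [{"role": "user", "content": "Hi"}], {"max_tokens": 32})
)
print(chunks[-1] if chunks else "", token_count)
```

One observation on the design: `chat_streamly` posts the request and then splits the fully buffered response body on SSE boundaries (`\n\n`), so with this implementation chunks are only surfaced after the whole response has arrived.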
diff --git a/rag/llm/cv_model.py b/rag/llm/cv_model.py
index 5867ce2e8..b40c0ba1c 100644
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@@ -137,7 +137,6 @@ class Base(ABC):
         ]
 
 
-
 class GptV4(Base):
     def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1"):
         if not base_url: base_url="https://api.openai.com/v1"
@@ -619,3 +618,65 @@ class LocalCV(Base):
 
     def describe(self, image, max_tokens=1024):
         return "", 0
+
+
+class NvidiaCV(Base):
+    def __init__(
+        self,
+        key,
+        model_name,
+        lang="Chinese",
+        base_url="https://ai.api.nvidia.com/v1/vlm",
+    ):
+        if not base_url:
+            base_url = "https://ai.api.nvidia.com/v1/vlm"
+        self.lang = lang
+        factory, llm_name = model_name.split("/")
+        if factory != "liuhaotian":
+            self.base_url = os.path.join(base_url, factory, llm_name)
+        else:
+            self.base_url = os.path.join(
+                base_url, "community", llm_name.replace("-v1.6", "16")
+            )
+        self.key = key
+
+    def describe(self, image, max_tokens=1024):
+        b64 = self.image2base64(image)
+        response = requests.post(
+            url=self.base_url,
+            headers={
+                "accept": "application/json",
+                "content-type": "application/json",
+                "Authorization": f"Bearer {self.key}",
+            },
+            json={
+                "messages": self.prompt(b64),
+                "max_tokens": max_tokens,
+            },
+        )
+        response = response.json()
+        return (
+            response["choices"][0]["message"]["content"].strip(),
+            response["usage"]["total_tokens"],
+        )
+
+    def prompt(self, b64):
+        return [
+            {
+                "role": "user",
+                "content": (
+                    "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
+                    if self.lang.lower() == "chinese"
+                    else "Please describe the content of this picture, like where, when, who, what happened. If it has number data, please extract them out."
+                )
+                + f' <img src="data:image/jpeg;base64,{b64}"/>',
+            }
+        ]
+
+    def chat_prompt(self, text, b64):
+        return [
+            {
+                "role": "user",
+                "content": text + f' <img src="data:image/jpeg;base64,{b64}"/>',
+            }
+        ]
diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py
index d1290981d..8b5387308 100644
--- a/rag/llm/embedding_model.py
+++ b/rag/llm/embedding_model.py
@@ -462,3 +462,41 @@ class GeminiEmbed(Base):
             title="Embedding of single string")
         token_count = num_tokens_from_string(text)
         return np.array(result['embedding']),token_count
+
+class NvidiaEmbed(Base):
+    def __init__(
+        self, key, model_name, base_url="https://integrate.api.nvidia.com/v1/embeddings"
+    ):
+        if not base_url:
+            base_url = "https://integrate.api.nvidia.com/v1/embeddings"
+        self.api_key = key
+        self.base_url = base_url
+        self.headers = {
+            "accept": "application/json",
+            "Content-Type": "application/json",
+            "authorization": f"Bearer {self.api_key}",
+        }
+        self.model_name = model_name
+        if model_name == "nvidia/embed-qa-4":
+            self.base_url = "https://ai.api.nvidia.com/v1/retrieval/nvidia/embeddings"
+            self.model_name = "NV-Embed-QA"
+        if model_name == "snowflake/arctic-embed-l":
+            self.base_url = "https://ai.api.nvidia.com/v1/retrieval/snowflake/arctic-embed-l/embeddings"
+
+    def encode(self, texts: list, batch_size=None):
+        payload = {
+            "input": texts,
+            "input_type": "query",
+            "model": self.model_name,
+            "encoding_format": "float",
+            "truncate": "END",
+        }
+        res = requests.post(self.base_url, headers=self.headers, json=payload).json()
+        return (
+            np.array([d["embedding"] for d in res["data"]]),
+            res["usage"]["total_tokens"],
+        )
+
+    def encode_queries(self, text):
+        embds, cnt = self.encode([text])
+        return np.array(embds[0]), cnt
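The embedding wrapper can likewise be smoke-tested in isolation; a sketch with a placeholder key and one of the embedding entries from the factory table:

```python
from rag.llm.embedding_model import NvidiaEmbed

# "baai/bge-m3" is one of the embedding entries above; the key is a placeholder.
embd = NvidiaEmbed(key="nvapi-...", model_name="baai/bge-m3")

# encode returns a (len(texts), dim) array plus the token usage the API reports.
vecs, used_tokens = embd.encode(["hello world", "vector search"])
print(vecs.shape, used_tokens)

# encode_queries is a single-text convenience wrapper around encode.
qvec, qcnt = embd.encode_queries("what is a rerank model?")
print(qvec.shape, qcnt)
```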
diff --git a/rag/llm/rerank_model.py b/rag/llm/rerank_model.py
index 4cf23bc4c..3a24da0b8 100644
--- a/rag/llm/rerank_model.py
+++ b/rag/llm/rerank_model.py
@@ -164,3 +164,41 @@ class LocalAIRerank(Base):
 
     def similarity(self, query: str, texts: list):
         raise NotImplementedError("The LocalAIRerank has not been implement")
+
+
+class NvidiaRerank(Base):
+    def __init__(
+        self, key, model_name, base_url="https://ai.api.nvidia.com/v1/retrieval/nvidia/"
+    ):
+        if not base_url:
+            base_url = "https://ai.api.nvidia.com/v1/retrieval/nvidia/"
+        self.model_name = model_name
+
+        if self.model_name == "nvidia/nv-rerankqa-mistral-4b-v3":
+            self.base_url = os.path.join(
+                base_url, "nv-rerankqa-mistral-4b-v3", "reranking"
+            )
+
+        if self.model_name == "nvidia/rerank-qa-mistral-4b":
+            self.base_url = os.path.join(base_url, "reranking")
+            self.model_name = "nv-rerank-qa-mistral-4b:1"
+
+        self.headers = {
+            "accept": "application/json",
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {key}",
+        }
+
+    def similarity(self, query: str, texts: list):
+        token_count = num_tokens_from_string(query) + sum(
+            [num_tokens_from_string(t) for t in texts]
+        )
+        data = {
+            "model": self.model_name,
+            "query": {"text": query},
+            "passages": [{"text": text} for text in texts],
+            "truncate": "END",
+            "top_n": len(texts),
+        }
+        res = requests.post(self.base_url, headers=self.headers, json=data).json()
+        return (np.array([d["logit"] for d in res["rankings"]]), token_count)
diff --git a/web/src/assets/svg/llm/nvidia.svg b/web/src/assets/svg/llm/nvidia.svg
new file mode 100644
index 000000000..217afaac9
--- /dev/null
+++ b/web/src/assets/svg/llm/nvidia.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/web/src/pages/user-setting/setting-model/constant.ts b/web/src/pages/user-setting/setting-model/constant.ts
index 3923c752b..88bb61e51 100644
--- a/web/src/pages/user-setting/setting-model/constant.ts
+++ b/web/src/pages/user-setting/setting-model/constant.ts
@@ -20,6 +20,7 @@ export const IconMap = {
   OpenRouter: 'open-router',
   LocalAI: 'local-ai',
   StepFun: 'stepfun',
+  NVIDIA: 'nvidia',
 };
 
 export const BedrockRegionList = [
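Finally, a sketch of driving the new reranker from `rag/llm/rerank_model.py` (placeholder key; `similarity` returns one relevance logit per passage plus a token count computed locally with `num_tokens_from_string`):

```python
from rag.llm.rerank_model import NvidiaRerank

# Placeholder key; the model name is one of the rerank entries registered above.
rr = NvidiaRerank(key="nvapi-...", model_name="nvidia/nv-rerankqa-mistral-4b-v3")

scores, tokens = rr.similarity(
    "what does this PR add?",
    ["NVIDIA chat, embedding, rerank and vision models.", "Unrelated text."],
)
print(scores, tokens)  # np.array of logits in passage order, token estimate
```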