ragflow/rag/llm/rerank_model.py
Kevin Hu b8da2eeb69
Feat: support huggingface re-rank model. (#5684)
### What problem does this PR solve?

#5658

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-03-06 10:44:04 +08:00

599 lines
20 KiB
Python

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
import threading
from urllib.parse import urljoin
import requests
import httpx
from huggingface_hub import snapshot_download
import os
from abc import ABC
import numpy as np
from yarl import URL
from api import settings
from api.utils.file_utils import get_home_cache_dir
from rag.utils import num_tokens_from_string, truncate
import json
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class Base(ABC):
def __init__(self, key, model_name):
pass
def similarity(self, query: str, texts: list):
raise NotImplementedError("Please implement encode method!")
def total_token_count(self, resp):
try:
return resp.usage.total_tokens
except Exception:
pass
try:
return resp["usage"]["total_tokens"]
except Exception:
pass
return 0
class DefaultRerank(Base):
_model = None
_model_lock = threading.Lock()
def __init__(self, key, model_name, **kwargs):
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
For Linux:
export HF_ENDPOINT=https://hf-mirror.com
For Windows:
Good luck
^_-
"""
if not settings.LIGHTEN and not DefaultRerank._model:
import torch
from FlagEmbedding import FlagReranker
with DefaultRerank._model_lock:
if not DefaultRerank._model:
try:
DefaultRerank._model = FlagReranker(
os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)),
use_fp16=torch.cuda.is_available())
except Exception:
model_dir = snapshot_download(repo_id=model_name,
local_dir=os.path.join(get_home_cache_dir(),
re.sub(r"^[a-zA-Z0-9]+/", "", model_name)),
local_dir_use_symlinks=False)
DefaultRerank._model = FlagReranker(model_dir, use_fp16=torch.cuda.is_available())
self._model = DefaultRerank._model
self._dynamic_batch_size = 8
self._min_batch_size = 1
def torch_empty_cache(self):
try:
import torch
torch.cuda.empty_cache()
except Exception as e:
print(f"Error emptying cache: {e}")
def _process_batch(self, pairs, max_batch_size=None):
"""template method for subclass call"""
old_dynamic_batch_size = self._dynamic_batch_size
if max_batch_size is not None:
self._dynamic_batch_size = max_batch_size
res = []
i = 0
while i < len(pairs):
current_batch = self._dynamic_batch_size
max_retries = 5
retry_count = 0
while retry_count < max_retries:
try:
# call subclass implemented batch processing calculation
batch_scores = self._compute_batch_scores(pairs[i:i + current_batch])
res.extend(batch_scores)
i += current_batch
self._dynamic_batch_size = min(self._dynamic_batch_size * 2, 8)
break
except RuntimeError as e:
if "CUDA out of memory" in str(e) and current_batch > self._min_batch_size:
current_batch = max(current_batch // 2, self._min_batch_size)
self.torch_empty_cache()
retry_count += 1
else:
raise
if retry_count >= max_retries:
raise RuntimeError("max retry times, still cannot process batch, please check your GPU memory")
self.torch_empty_cache()
self._dynamic_batch_size = old_dynamic_batch_size
return np.array(res)
def _compute_batch_scores(self, batch_pairs, max_length=None):
if max_length is None:
scores = self._model.compute_score(batch_pairs)
else:
scores = self._model.compute_score(batch_pairs, max_length=max_length)
scores = sigmoid(np.array(scores)).tolist()
return scores
def similarity(self, query: str, texts: list):
pairs = [(query, truncate(t, 2048)) for t in texts]
token_count = 0
for _, t in pairs:
token_count += num_tokens_from_string(t)
batch_size = 4096
res = self._process_batch(pairs, max_batch_size=batch_size)
return np.array(res), token_count
class JinaRerank(Base):
def __init__(self, key, model_name="jina-reranker-v2-base-multilingual",
base_url="https://api.jina.ai/v1/rerank"):
self.base_url = "https://api.jina.ai/v1/rerank"
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {key}"
}
self.model_name = model_name
def similarity(self, query: str, texts: list):
texts = [truncate(t, 8196) for t in texts]
data = {
"model": self.model_name,
"query": query,
"documents": texts,
"top_n": len(texts)
}
res = requests.post(self.base_url, headers=self.headers, json=data).json()
rank = np.zeros(len(texts), dtype=float)
for d in res["results"]:
rank[d["index"]] = d["relevance_score"]
return rank, self.total_token_count(res)
class YoudaoRerank(DefaultRerank):
_model = None
_model_lock = threading.Lock()
def __init__(self, key=None, model_name="maidalun1020/bce-reranker-base_v1", **kwargs):
if not settings.LIGHTEN and not YoudaoRerank._model:
from BCEmbedding import RerankerModel
with YoudaoRerank._model_lock:
if not YoudaoRerank._model:
try:
YoudaoRerank._model = RerankerModel(model_name_or_path=os.path.join(
get_home_cache_dir(),
re.sub(r"^[a-zA-Z0-9]+/", "", model_name)))
except Exception:
YoudaoRerank._model = RerankerModel(
model_name_or_path=model_name.replace(
"maidalun1020", "InfiniFlow"))
self._model = YoudaoRerank._model
def similarity(self, query: str, texts: list):
pairs = [(query, truncate(t, self._model.max_length)) for t in texts]
token_count = 0
for _, t in pairs:
token_count += num_tokens_from_string(t)
batch_size = 8
res = self._process_batch(pairs, max_batch_size=batch_size)
return np.array(res), token_count
class XInferenceRerank(Base):
def __init__(self, key="xxxxxxx", model_name="", base_url=""):
if base_url.find("/v1") == -1:
base_url = urljoin(base_url, "/v1/rerank")
if base_url.find("/rerank") == -1:
base_url = urljoin(base_url, "/v1/rerank")
self.model_name = model_name
self.base_url = base_url
self.headers = {
"Content-Type": "application/json",
"accept": "application/json",
"Authorization": f"Bearer {key}"
}
def similarity(self, query: str, texts: list):
if len(texts) == 0:
return np.array([]), 0
pairs = [(query, truncate(t, 4096)) for t in texts]
token_count = 0
for _, t in pairs:
token_count += num_tokens_from_string(t)
data = {
"model": self.model_name,
"query": query,
"return_documents": "true",
"return_len": "true",
"documents": texts
}
res = requests.post(self.base_url, headers=self.headers, json=data).json()
rank = np.zeros(len(texts), dtype=float)
for d in res["results"]:
rank[d["index"]] = d["relevance_score"]
return rank, token_count
class LocalAIRerank(Base):
def __init__(self, key, model_name, base_url):
if base_url.find("/rerank") == -1:
self.base_url = urljoin(base_url, "/rerank")
else:
self.base_url = base_url
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {key}"
}
self.model_name = model_name.split("___")[0]
def similarity(self, query: str, texts: list):
# noway to config Ragflow , use fix setting
texts = [truncate(t, 500) for t in texts]
data = {
"model": self.model_name,
"query": query,
"documents": texts,
"top_n": len(texts),
}
token_count = 0
for t in texts:
token_count += num_tokens_from_string(t)
res = requests.post(self.base_url, headers=self.headers, json=data).json()
rank = np.zeros(len(texts), dtype=float)
if 'results' not in res:
raise ValueError("response not contains results\n" + str(res))
for d in res["results"]:
rank[d["index"]] = d["relevance_score"]
# Normalize the rank values to the range 0 to 1
min_rank = np.min(rank)
max_rank = np.max(rank)
# Avoid division by zero if all ranks are identical
if max_rank - min_rank != 0:
rank = (rank - min_rank) / (max_rank - min_rank)
else:
rank = np.zeros_like(rank)
return rank, token_count
class NvidiaRerank(Base):
def __init__(
self, key, model_name, base_url="https://ai.api.nvidia.com/v1/retrieval/nvidia/"
):
if not base_url:
base_url = "https://ai.api.nvidia.com/v1/retrieval/nvidia/"
self.model_name = model_name
if self.model_name == "nvidia/nv-rerankqa-mistral-4b-v3":
self.base_url = os.path.join(
base_url, "nv-rerankqa-mistral-4b-v3", "reranking"
)
if self.model_name == "nvidia/rerank-qa-mistral-4b":
self.base_url = os.path.join(base_url, "reranking")
self.model_name = "nv-rerank-qa-mistral-4b:1"
self.headers = {
"accept": "application/json",
"Content-Type": "application/json",
"Authorization": f"Bearer {key}",
}
def similarity(self, query: str, texts: list):
token_count = num_tokens_from_string(query) + sum(
[num_tokens_from_string(t) for t in texts]
)
data = {
"model": self.model_name,
"query": {"text": query},
"passages": [{"text": text} for text in texts],
"truncate": "END",
"top_n": len(texts),
}
res = requests.post(self.base_url, headers=self.headers, json=data).json()
rank = np.zeros(len(texts), dtype=float)
for d in res["rankings"]:
rank[d["index"]] = d["logit"]
return rank, token_count
class LmStudioRerank(Base):
def __init__(self, key, model_name, base_url):
pass
def similarity(self, query: str, texts: list):
raise NotImplementedError("The LmStudioRerank has not been implement")
class OpenAI_APIRerank(Base):
def __init__(self, key, model_name, base_url):
if base_url.find("/rerank") == -1:
self.base_url = urljoin(base_url, "/rerank")
else:
self.base_url = base_url
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {key}"
}
self.model_name = model_name.split("___")[0]
def similarity(self, query: str, texts: list):
# noway to config Ragflow , use fix setting
texts = [truncate(t, 500) for t in texts]
data = {
"model": self.model_name,
"query": query,
"documents": texts,
"top_n": len(texts),
}
token_count = 0
for t in texts:
token_count += num_tokens_from_string(t)
res = requests.post(self.base_url, headers=self.headers, json=data).json()
rank = np.zeros(len(texts), dtype=float)
if 'results' not in res:
raise ValueError("response not contains results\n" + str(res))
for d in res["results"]:
rank[d["index"]] = d["relevance_score"]
# Normalize the rank values to the range 0 to 1
min_rank = np.min(rank)
max_rank = np.max(rank)
# Avoid division by zero if all ranks are identical
if max_rank - min_rank != 0:
rank = (rank - min_rank) / (max_rank - min_rank)
else:
rank = np.zeros_like(rank)
return rank, token_count
class CoHereRerank(Base):
def __init__(self, key, model_name, base_url=None):
from cohere import Client
self.client = Client(api_key=key)
self.model_name = model_name
def similarity(self, query: str, texts: list):
token_count = num_tokens_from_string(query) + sum(
[num_tokens_from_string(t) for t in texts]
)
res = self.client.rerank(
model=self.model_name,
query=query,
documents=texts,
top_n=len(texts),
return_documents=False,
)
rank = np.zeros(len(texts), dtype=float)
for d in res.results:
rank[d.index] = d.relevance_score
return rank, token_count
class TogetherAIRerank(Base):
def __init__(self, key, model_name, base_url):
pass
def similarity(self, query: str, texts: list):
raise NotImplementedError("The api has not been implement")
class SILICONFLOWRerank(Base):
def __init__(
self, key, model_name, base_url="https://api.siliconflow.cn/v1/rerank"
):
if not base_url:
base_url = "https://api.siliconflow.cn/v1/rerank"
self.model_name = model_name
self.base_url = base_url
self.headers = {
"accept": "application/json",
"content-type": "application/json",
"authorization": f"Bearer {key}",
}
def similarity(self, query: str, texts: list):
payload = {
"model": self.model_name,
"query": query,
"documents": texts,
"top_n": len(texts),
"return_documents": False,
"max_chunks_per_doc": 1024,
"overlap_tokens": 80,
}
response = requests.post(
self.base_url, json=payload, headers=self.headers
).json()
rank = np.zeros(len(texts), dtype=float)
if "results" not in response:
return rank, 0
for d in response["results"]:
rank[d["index"]] = d["relevance_score"]
return (
rank,
response["meta"]["tokens"]["input_tokens"] + response["meta"]["tokens"]["output_tokens"],
)
class BaiduYiyanRerank(Base):
def __init__(self, key, model_name, base_url=None):
from qianfan.resources import Reranker
key = json.loads(key)
ak = key.get("yiyan_ak", "")
sk = key.get("yiyan_sk", "")
self.client = Reranker(ak=ak, sk=sk)
self.model_name = model_name
def similarity(self, query: str, texts: list):
res = self.client.do(
model=self.model_name,
query=query,
documents=texts,
top_n=len(texts),
).body
rank = np.zeros(len(texts), dtype=float)
for d in res["results"]:
rank[d["index"]] = d["relevance_score"]
return rank, self.total_token_count(res)
class VoyageRerank(Base):
def __init__(self, key, model_name, base_url=None):
import voyageai
self.client = voyageai.Client(api_key=key)
self.model_name = model_name
def similarity(self, query: str, texts: list):
rank = np.zeros(len(texts), dtype=float)
if not texts:
return rank, 0
res = self.client.rerank(
query=query, documents=texts, model=self.model_name, top_k=len(texts)
)
for r in res.results:
rank[r.index] = r.relevance_score
return rank, res.total_tokens
class QWenRerank(Base):
def __init__(self, key, model_name='gte-rerank', base_url=None, **kwargs):
import dashscope
self.api_key = key
self.model_name = dashscope.TextReRank.Models.gte_rerank if model_name is None else model_name
def similarity(self, query: str, texts: list):
import dashscope
from http import HTTPStatus
resp = dashscope.TextReRank.call(
api_key=self.api_key,
model=self.model_name,
query=query,
documents=texts,
top_n=len(texts),
return_documents=False
)
rank = np.zeros(len(texts), dtype=float)
if resp.status_code == HTTPStatus.OK:
for r in resp.output.results:
rank[r.index] = r.relevance_score
return rank, resp.usage.total_tokens
else:
raise ValueError(f"Error calling QWenRerank model {self.model_name}: {resp.status_code} - {resp.text}")
class HuggingfaceRerank(DefaultRerank):
@staticmethod
def post(query: str, texts: list, url="127.0.0.1"):
exc = None
scores = [0 for _ in range(len(texts))]
batch_size = 8
for i in range(0, len(texts), batch_size):
try:
res = requests.post(f"http://{url}/rerank", headers={"Content-Type": "application/json"},
json={"query": query, "texts": texts[i: i + batch_size],
"raw_scores": False, "truncate": True})
for o in res.json():
scores[o["index"] + i] = o["score"]
except Exception as e:
exc = e
if exc:
raise exc
return np.array(scores)
def __init__(self, key, model_name="BAAI/bge-reranker-v2-m3", base_url="http://127.0.0.1"):
self.model_name = model_name
self.base_url = base_url
def similarity(self, query: str, texts: list) -> tuple[np.ndarray, int]:
if not texts:
return np.array([]), 0
token_count = 0
for t in texts:
token_count += num_tokens_from_string(t)
return HuggingfaceRerank.post(query, texts, self.base_url), token_count
class GPUStackRerank(Base):
def __init__(
self, key, model_name, base_url
):
if not base_url:
raise ValueError("url cannot be None")
self.model_name = model_name
self.base_url = str(URL(base_url) / "v1" / "rerank")
self.headers = {
"accept": "application/json",
"content-type": "application/json",
"authorization": f"Bearer {key}",
}
def similarity(self, query: str, texts: list):
payload = {
"model": self.model_name,
"query": query,
"documents": texts,
"top_n": len(texts),
}
try:
response = requests.post(
self.base_url, json=payload, headers=self.headers
)
response.raise_for_status()
response_json = response.json()
rank = np.zeros(len(texts), dtype=float)
if "results" not in response_json:
return rank, 0
token_count = 0
for t in texts:
token_count += num_tokens_from_string(t)
for result in response_json["results"]:
rank[result["index"]] = result["relevance_score"]
return (
rank,
token_count,
)
except httpx.HTTPStatusError as e:
raise ValueError(
f"Error calling GPUStackRerank model {self.model_name}: {e.response.status_code} - {e.response.text}")