Reduce the number of Elasticsearch retry attempts (#3550)

### What problem does this PR solve?

Related issue: #3541

### Type of change


- [x] Refactoring
- [x] Performance Improvement
This commit is contained in:
Kevin Hu 2024-11-21 11:37:45 +08:00 committed by GitHub
parent 58a2200b80
commit 0ac6dc8f8c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -16,13 +16,15 @@ from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr,
FusionExpr FusionExpr
from rag.nlp import is_english, rag_tokenizer from rag.nlp import is_english, rag_tokenizer
ATTEMPT_TIME = 2
@singleton @singleton
class ESConnection(DocStoreConnection): class ESConnection(DocStoreConnection):
def __init__(self): def __init__(self):
self.info = {} self.info = {}
logging.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.") logging.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.")
for _ in range(24): for _ in range(ATTEMPT_TIME):
try: try:
self.es = Elasticsearch( self.es = Elasticsearch(
settings.ES["hosts"].split(","), settings.ES["hosts"].split(","),
@ -92,7 +94,7 @@ class ESConnection(DocStoreConnection):
def indexExist(self, indexName: str, knowledgebaseId: str) -> bool: def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
s = Index(indexName, self.es) s = Index(indexName, self.es)
for i in range(3): for i in range(ATTEMPT_TIME):
try: try:
return s.exists() return s.exists()
except Exception as e: except Exception as e:
@ -144,9 +146,9 @@ class ESConnection(DocStoreConnection):
if "minimum_should_match" in m.extra_options: if "minimum_should_match" in m.extra_options:
minimum_should_match = str(int(m.extra_options["minimum_should_match"] * 100)) + "%" minimum_should_match = str(int(m.extra_options["minimum_should_match"] * 100)) + "%"
bqry.must.append(Q("query_string", fields=m.fields, bqry.must.append(Q("query_string", fields=m.fields,
type="best_fields", query=m.matching_text, type="best_fields", query=m.matching_text,
minimum_should_match=minimum_should_match, minimum_should_match=minimum_should_match,
boost=1)) boost=1))
bqry.boost = 1.0 - vector_similarity_weight bqry.boost = 1.0 - vector_similarity_weight
elif isinstance(m, MatchDenseExpr): elif isinstance(m, MatchDenseExpr):
@ -180,7 +182,7 @@ class ESConnection(DocStoreConnection):
q = s.to_dict() q = s.to_dict()
logging.debug(f"ESConnection.search {str(indexNames)} query: " + json.dumps(q)) logging.debug(f"ESConnection.search {str(indexNames)} query: " + json.dumps(q))
for i in range(3): for i in range(ATTEMPT_TIME):
try: try:
res = self.es.search(index=indexNames, res = self.es.search(index=indexNames,
body=q, body=q,
@ -201,7 +203,7 @@ class ESConnection(DocStoreConnection):
raise Exception("ESConnection.search timeout.") raise Exception("ESConnection.search timeout.")
def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None: def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
for i in range(3): for i in range(ATTEMPT_TIME):
try: try:
res = self.es.get(index=(indexName), res = self.es.get(index=(indexName),
id=chunkId, source=True, ) id=chunkId, source=True, )
@ -233,7 +235,7 @@ class ESConnection(DocStoreConnection):
operations.append(d_copy) operations.append(d_copy)
res = [] res = []
for _ in range(100): for _ in range(ATTEMPT_TIME):
try: try:
r = self.es.bulk(index=(indexName), operations=operations, r = self.es.bulk(index=(indexName), operations=operations,
refresh=False, timeout="600s") refresh=False, timeout="600s")
@ -258,7 +260,7 @@ class ESConnection(DocStoreConnection):
if "id" in condition and isinstance(condition["id"], str): if "id" in condition and isinstance(condition["id"], str):
# update specific single document # update specific single document
chunkId = condition["id"] chunkId = condition["id"]
for i in range(3): for i in range(ATTEMPT_TIME):
try: try:
self.es.update(index=indexName, id=chunkId, doc=doc) self.es.update(index=indexName, id=chunkId, doc=doc)
return True return True
@ -326,7 +328,7 @@ class ESConnection(DocStoreConnection):
else: else:
raise Exception("Condition value must be int, str or list.") raise Exception("Condition value must be int, str or list.")
logging.debug("ESConnection.delete query: " + json.dumps(qry.to_dict())) logging.debug("ESConnection.delete query: " + json.dumps(qry.to_dict()))
for _ in range(10): for _ in range(ATTEMPT_TIME):
try: try:
res = self.es.delete_by_query( res = self.es.delete_by_query(
index=indexName, index=indexName,
@ -437,7 +439,7 @@ class ESConnection(DocStoreConnection):
sql = sql.replace(p, r, 1) sql = sql.replace(p, r, 1)
logging.debug(f"ESConnection.sql to es: {sql}") logging.debug(f"ESConnection.sql to es: {sql}")
for i in range(3): for i in range(ATTEMPT_TIME):
try: try:
res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format,
request_timeout="2s") request_timeout="2s")