Use consistent log file names; introduce initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names; introduce initLogger (`initRootLogger` in `api/utils/log_utils.py`), so each entry point configures the root logger once and writes to a log file named after its script, and replace the shared `api.utils.log_utils.logger` object with the standard `logging` module throughout.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
Commit 30f6421760 (parent ab4384e011), authored by Zhichang Yu on 2024-11-14 17:13:48 +08:00 and committed via GitHub.
75 changed files with 396 additions and 402 deletions
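Taken together, the diffs below follow a single pattern: each entry-point script configures the root logger once through `initRootLogger`, and every other module drops the shared `api.utils.log_utils.logger` object in favor of the standard `logging` module (several `logger.info` calls are also demoted to `logging.debug`). A minimal sketch of that pattern, assuming the `initRootLogger` signature shown in the `api/utils/log_utils.py` diff further down; the version string is only a placeholder:

```python
# Entry point (e.g. api/ragflow_server.py in the diffs below): configure the
# root logger once, keyed to the script's file name, so each service writes to
# its own consistently named file under <project>/logs/.
import inspect
import logging

from api.utils.log_utils import initRootLogger

initRootLogger(inspect.getfile(inspect.currentframe()))

# Any other module: no custom logger object, just the stdlib root logger.
logging.info("RAGFlow version: %s", "<placeholder>")

try:
    raise RuntimeError("demo failure")
except RuntimeError:
    logging.exception("Canvas.run got exception")
```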

View File

@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import json
from abc import ABC
from copy import deepcopy
from functools import partial
from agent.component import component_class
from agent.component.base import ComponentBase
from api.utils.log_utils import logger
class Canvas(ABC):
"""
@ -187,7 +187,7 @@ class Canvas(ABC):
if cpn.component_name == "Answer":
self.answer.append(c)
else:
logger.debug(f"Canvas.prepare2run: {c}")
logging.debug(f"Canvas.prepare2run: {c}")
cpids = cpn.get_dependent_components()
if any([c not in self.path[-1] for c in cpids]):
continue
@ -197,7 +197,7 @@ class Canvas(ABC):
prepare2run(self.components[self.path[-2][-1]]["downstream"])
while 0 <= ran < len(self.path[-1]):
logger.debug(f"Canvas.run: {ran} {self.path}")
logging.debug(f"Canvas.run: {ran} {self.path}")
cpn_id = self.path[-1][ran]
cpn = self.get_component(cpn_id)
if not cpn["downstream"]: break
@ -217,7 +217,7 @@ class Canvas(ABC):
self.get_component(p)["obj"].set_exception(e)
prepare2run([p])
break
logger.exception("Canvas.run got exception")
logging.exception("Canvas.run got exception")
break
continue
@ -229,7 +229,7 @@ class Canvas(ABC):
self.get_component(p)["obj"].set_exception(e)
prepare2run([p])
break
logger.exception("Canvas.run got exception")
logging.exception("Canvas.run got exception")
break
if self.answer:

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import arxiv
import pandas as pd
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class ArXivParam(ComponentParamBase):
"""
@ -64,5 +64,5 @@ class ArXiv(ComponentBase, ABC):
return ArXiv.be_output("")
df = pd.DataFrame(arxiv_res)
logger.debug(f"df: {str(df)}")
logging.debug(f"df: {str(df)}")
return df

View File

@ -13,12 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import pandas as pd
import requests
import re
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class BaiduParam(ComponentParamBase):
@ -62,6 +62,6 @@ class Baidu(ComponentBase, ABC):
return Baidu.be_output("")
df = pd.DataFrame(baidu_res)
logger.debug(f"df: {str(df)}")
logging.debug(f"df: {str(df)}")
return df

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import builtins
import json
@ -23,7 +24,6 @@ from typing import Tuple, Union
import pandas as pd
from agent import settings
from api.utils.log_utils import logger
_FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params"
@ -361,13 +361,13 @@ class ComponentParamBase(ABC):
def _warn_deprecated_param(self, param_name, descr):
if self._deprecated_params_set.get(param_name):
logger.warning(
logging.warning(
f"{descr} {param_name} is deprecated and ignored in this version."
)
def _warn_to_deprecate_param(self, param_name, descr, new_param):
if self._deprecated_params_set.get(param_name):
logger.warning(
logging.warning(
f"{descr} {param_name} will be deprecated in future release; "
f"please use {new_param} instead."
)
@ -403,7 +403,7 @@ class ComponentBase(ABC):
return cpnts
def run(self, history, **kwargs):
logger.info("{}, history: {}, kwargs: {}".format(self, json.dumps(history, ensure_ascii=False),
logging.debug("{}, history: {}, kwargs: {}".format(self, json.dumps(history, ensure_ascii=False),
json.dumps(kwargs, ensure_ascii=False)))
try:
res = self._run(history, **kwargs)
@ -476,7 +476,7 @@ class ComponentBase(ABC):
reversed_cpnts.extend(self._canvas.path[-2])
reversed_cpnts.extend(self._canvas.path[-1])
logger.debug(f"{self.component_name} {reversed_cpnts[::-1]}")
logging.debug(f"{self.component_name} {reversed_cpnts[::-1]}")
for u in reversed_cpnts[::-1]:
if self.get_component_name(u) in ["switch", "concentrator"]: continue
if self.component_name.lower() == "generate" and self.get_component_name(u) == "retrieval":

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import requests
import pandas as pd
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class BingParam(ComponentParamBase):
"""
@ -80,5 +80,5 @@ class Bing(ComponentBase, ABC):
return Bing.be_output("")
df = pd.DataFrame(bing_res)
logger.debug(f"df: {str(df)}")
logging.debug(f"df: {str(df)}")
return df

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from agent.component import GenerateParam, Generate
from api.utils.log_utils import logger
class CategorizeParam(GenerateParam):
@ -77,7 +77,7 @@ class Categorize(Generate, ABC):
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": input}],
self._param.gen_conf())
logger.debug(f"input: {input}, answer: {str(ans)}")
logging.debug(f"input: {input}, answer: {str(ans)}")
for c in self._param.category_description.keys():
if ans.lower().find(c.lower()) >= 0:
return Categorize.be_output(self._param.category_description[c]["to"])

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
from duckduckgo_search import DDGS
import pandas as pd
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class DuckDuckGoParam(ComponentParamBase):
@ -62,5 +62,5 @@ class DuckDuckGo(ComponentBase, ABC):
return DuckDuckGo.be_output("")
df = pd.DataFrame(duck_res)
logger.debug("df: {df}")
logging.debug("df: {df}")
return df

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import pandas as pd
import requests
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class GitHubParam(ComponentParamBase):
@ -57,5 +57,5 @@ class GitHub(ComponentBase, ABC):
return GitHub.be_output("")
df = pd.DataFrame(github_res)
logger.debug(f"df: {df}")
logging.debug(f"df: {df}")
return df

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
from serpapi import GoogleSearch
import pandas as pd
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class GoogleParam(ComponentParamBase):
@ -92,5 +92,5 @@ class Google(ComponentBase, ABC):
return Google.be_output("")
df = pd.DataFrame(google_res)
logger.debug(f"df: {df}")
logging.debug(f"df: {df}")
return df

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import pandas as pd
from agent.component.base import ComponentBase, ComponentParamBase
from scholarly import scholarly
from api.utils.log_utils import logger
class GoogleScholarParam(ComponentParamBase):
@ -59,12 +59,12 @@ class GoogleScholar(ComponentBase, ABC):
'bib'].get('abstract', 'no abstract')})
except StopIteration or Exception:
logger.exception("GoogleScholar")
logging.exception("GoogleScholar")
break
if not scholar_res:
return GoogleScholar.be_output("")
df = pd.DataFrame(scholar_res)
logger.debug(f"df: {df}")
logging.debug(f"df: {df}")
return df

View File

@ -13,12 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
from abc import ABC
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from agent.component import GenerateParam, Generate
from api.utils.log_utils import logger
class KeywordExtractParam(GenerateParam):
@ -58,5 +58,5 @@ class KeywordExtract(Generate, ABC):
self._param.gen_conf())
ans = re.sub(r".*keyword:", "", ans).strip()
logger.info(f"ans: {ans}")
logging.debug(f"ans: {ans}")
return KeywordExtract.be_output(ans)

View File

@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
from Bio import Entrez
import re
import pandas as pd
import xml.etree.ElementTree as ET
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class PubMedParam(ComponentParamBase):
@ -65,5 +65,5 @@ class PubMed(ComponentBase, ABC):
return PubMed.be_output("")
df = pd.DataFrame(pubmed_res)
logger.debug(f"df: {df}")
logging.debug(f"df: {df}")
return df

View File

@ -13,12 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from agent.component import GenerateParam, Generate
from rag.utils import num_tokens_from_string, encoder
from api.utils.log_utils import logger
class RelevantParam(GenerateParam):
@ -71,7 +71,7 @@ class Relevant(Generate, ABC):
ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": ans}],
self._param.gen_conf())
logger.info(ans)
logging.debug(ans)
if ans.lower().find("yes") >= 0:
return Relevant.be_output(self._param.yes)
if ans.lower().find("no") >= 0:

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import pandas as pd
@ -22,7 +23,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.settings import retrievaler
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class RetrievalParam(ComponentParamBase):
@ -81,7 +81,7 @@ class Retrieval(ComponentBase, ABC):
df = pd.DataFrame(kbinfos["chunks"])
df["content"] = df["content_with_weight"]
del df["content_with_weight"]
logger.debug("{} {}".format(query, df))
logging.debug("{} {}".format(query, df))
return df

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from agent.component import GenerateParam, Generate
from api.utils.log_utils import logger
class RewriteQuestionParam(GenerateParam):
@ -105,7 +105,7 @@ class RewriteQuestion(Generate, ABC):
self._canvas.history.pop()
self._canvas.history.append(("user", ans))
logger.info(ans)
logging.debug(ans)
return RewriteQuestion.be_output(ans)

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import wikipedia
import pandas as pd
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger
class WikipediaParam(ComponentParamBase):
@ -63,5 +63,5 @@ class Wikipedia(ComponentBase, ABC):
return Wikipedia.be_output("")
df = pd.DataFrame(wiki_res)
logger.debug(f"df: {df}")
logging.debug(f"df: {df}")
return df

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from abc import ABC
import pandas as pd
from agent.component.base import ComponentBase, ComponentParamBase
import yfinance as yf
from api.utils.log_utils import logger
class YahooFinanceParam(ComponentParamBase):
@ -76,7 +76,7 @@ class YahooFinance(ComponentBase, ABC):
if self._param.news:
yohoo_res.append({"content": "news:\n" + pd.DataFrame(msft.news).to_markdown() + "\n"})
except Exception:
logger.exception("YahooFinance got exception")
logging.exception("YahooFinance got exception")
if not yohoo_res:
return YahooFinance.be_output("")

View File

@ -15,6 +15,7 @@
#
import os
import sys
import logging
from importlib.util import module_from_spec, spec_from_file_location
from pathlib import Path
from flask import Blueprint, Flask
@ -32,7 +33,6 @@ from flask_login import LoginManager
from api.settings import SECRET_KEY
from api.settings import API_VERSION
from api.utils.api_utils import server_error_response
from api.utils.log_utils import logger
from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
__all__ = ["app"]
@ -154,7 +154,7 @@ def load_user(web_request):
else:
return None
except Exception:
logger.exception("load_user got exception")
logging.exception("load_user got exception")
return None
else:
return None

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import json
from functools import partial
from flask import request, Response
@ -23,7 +24,6 @@ from api.utils import get_uuid
from api.utils.api_utils import get_json_result, server_error_response, validate_request, get_data_error_result
from agent.canvas import Canvas
from peewee import MySQLDatabase, PostgresqlDatabase
from api.utils.log_utils import logger
@manager.route('/templates', methods=['GET'])
@ -115,7 +115,7 @@ def run():
pass
canvas.add_user_input(req["message"])
answer = canvas.run(stream=stream)
logger.info(canvas)
logging.debug(canvas)
except Exception as e:
return server_error_response(e)

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import json
from flask import request
@ -25,7 +26,6 @@ from api.db.db_models import TenantLLM
from api.utils.api_utils import get_json_result
from rag.llm import EmbeddingModel, ChatModel, RerankModel, CvModel, TTSModel
import requests
from api.utils.log_utils import logger
@manager.route('/factories', methods=['GET'])
@ -90,7 +90,7 @@ def set_api_key():
if len(arr) == 0 or tc == 0:
raise Exception("Fail")
rerank_passed = True
logger.info(f'passed model rerank {llm.llm_name}')
logging.debug(f'passed model rerank {llm.llm_name}')
except Exception as e:
msg += f"\nFail to access model({llm.llm_name}) using this api key." + str(
e)

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import json
import re
from datetime import datetime
@ -54,7 +55,6 @@ from api.settings import (
from api.db.services.user_service import UserService, TenantService, UserTenantService
from api.db.services.file_service import FileService
from api.utils.api_utils import get_json_result, construct_response
from api.utils.log_utils import logger
@manager.route("/login", methods=["POST", "GET"])
@ -177,7 +177,7 @@ def github_callback():
try:
avatar = download_img(user_info["avatar_url"])
except Exception as e:
logger.exception(e)
logging.exception(e)
avatar = ""
users = user_register(
user_id,
@ -202,7 +202,7 @@ def github_callback():
return redirect("/?auth=%s" % user.get_id())
except Exception as e:
rollback_user_registration(user_id)
logger.exception(e)
logging.exception(e)
return redirect("/?error=%s" % str(e))
# User has already registered, try to log in
@ -279,7 +279,7 @@ def feishu_callback():
try:
avatar = download_img(user_info["avatar_url"])
except Exception as e:
logger.exception(e)
logging.exception(e)
avatar = ""
users = user_register(
user_id,
@ -304,7 +304,7 @@ def feishu_callback():
return redirect("/?auth=%s" % user.get_id())
except Exception as e:
rollback_user_registration(user_id)
logger.exception(e)
logging.exception(e)
return redirect("/?error=%s" % str(e))
# User has already registered, try to log in
@ -436,7 +436,7 @@ def setting_user():
UserService.update_by_id(current_user.id, update_dict)
return get_json_result(data=True)
except Exception as e:
logger.exception(e)
logging.exception(e)
return get_json_result(
data=False, message="Update failure!", code=RetCode.EXCEPTION_ERROR
)
@ -621,7 +621,7 @@ def user_add():
)
except Exception as e:
rollback_user_registration(user_id)
logger.exception(e)
logging.exception(e)
return get_json_result(
data=False,
message=f"User registration failure, error: {str(e)}",

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import inspect
import os
import sys
@ -32,7 +33,6 @@ from playhouse.pool import PooledMySQLDatabase, PooledPostgresqlDatabase
from api.db import SerializedType, ParserType
from api.settings import DATABASE, SECRET_KEY, DATABASE_TYPE
from api import utils
from api.utils.log_utils import logger
def singleton(cls, *args, **kw):
instances = {}
@ -285,7 +285,7 @@ class BaseDataBase:
database_config = DATABASE.copy()
db_name = database_config.pop("name")
self.database_connection = PooledDatabase[DATABASE_TYPE.upper()].value(db_name, **database_config)
logger.info('init database on cluster mode successfully')
logging.info('init database on cluster mode successfully')
class PostgresDatabaseLock:
def __init__(self, lock_name, timeout=10, db=None):
@ -393,7 +393,7 @@ def close_connection():
if DB:
DB.close_stale(age=30)
except Exception as e:
logger.exception(e)
logging.exception(e)
class DataBaseModel(BaseModel):
@ -409,15 +409,15 @@ def init_database_tables(alter_fields=[]):
for name, obj in members:
if obj != DataBaseModel and issubclass(obj, DataBaseModel):
table_objs.append(obj)
logger.info(f"start create table {obj.__name__}")
logging.debug(f"start create table {obj.__name__}")
try:
obj.create_table()
logger.info(f"create table success: {obj.__name__}")
logging.debug(f"create table success: {obj.__name__}")
except Exception as e:
logger.exception(e)
logging.exception(e)
create_failed_list.append(obj.__name__)
if create_failed_list:
logger.info(f"create tables failed: {create_failed_list}")
logging.error(f"create tables failed: {create_failed_list}")
raise Exception(f"create tables failed: {create_failed_list}")
migrate_db()

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import base64
import json
import os
@ -30,7 +31,6 @@ from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantL
from api.db.services.user_service import TenantService, UserTenantService
from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
def encode_to_base64(input_string):
@ -70,26 +70,26 @@ def init_superuser():
"api_key": API_KEY, "api_base": LLM_BASE_URL})
if not UserService.save(**user_info):
logger.info("can't init admin.")
logging.error("can't init admin.")
return
TenantService.insert(**tenant)
UserTenantService.insert(**usr_tenant)
TenantLLMService.insert_many(tenant_llm)
logger.info(
"Super user initialized. email: admin@ragflow.io, password: admin. Changing the password after logining is strongly recomanded.")
logging.info(
"Super user initialized. email: admin@ragflow.io, password: admin. Changing the password after login is strongly recommended.")
chat_mdl = LLMBundle(tenant["id"], LLMType.CHAT, tenant["llm_id"])
msg = chat_mdl.chat(system="", history=[
{"role": "user", "content": "Hello!"}], gen_conf={})
if msg.find("ERROR: ") == 0:
logger.error(
logging.error(
"'{}' dosen't work. {}".format(
tenant["llm_id"],
msg))
embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"])
v, c = embd_mdl.encode(["Hello!"])
if c == 0:
logger.error(
logging.error(
"'{}' dosen't work!".format(
tenant["embd_id"]))
@ -172,7 +172,7 @@ def add_graph_templates():
except:
CanvasTemplateService.update_by_id(cnvs["id"], cnvs)
except Exception:
logger.exception("Add graph templates error: ")
logging.exception("Add graph templates error: ")
def init_web_data():
@ -183,7 +183,7 @@ def init_web_data():
# init_superuser()
add_graph_templates()
logger.info("init web data success:{}".format(time.time() - start_time))
logging.info("init web data success:{}".format(time.time() - start_time))
if __name__ == '__main__':

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import binascii
import os
import json
@ -31,7 +32,6 @@ from rag.app.resume import forbidden_select_fields4resume
from rag.nlp.search import index_name
from rag.utils import rmSpace, num_tokens_from_string, encoder
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class DialogService(CommonService):
@ -178,7 +178,7 @@ def chat(dialog, messages, stream=True, **kwargs):
tts_mdl = LLMBundle(dialog.tenant_id, LLMType.TTS)
# try to use sql if field mapping is good to go
if field_map:
logger.info("Use SQL to retrieval:{}".format(questions[-1]))
logging.debug("Use SQL to retrieval:{}".format(questions[-1]))
ans = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl, prompt_config.get("quote", True))
if ans:
yield ans
@ -220,7 +220,7 @@ def chat(dialog, messages, stream=True, **kwargs):
doc_ids=attachments,
top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
logger.info(
logging.debug(
"{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
retrieval_tm = timer()
@ -292,7 +292,7 @@ def chat(dialog, messages, stream=True, **kwargs):
yield decorate_answer(answer)
else:
answer = chat_mdl.chat(prompt, msg[1:], gen_conf)
logger.info("User: {}|Assistant: {}".format(
logging.debug("User: {}|Assistant: {}".format(
msg[-1]["content"], answer))
res = decorate_answer(answer)
res["audio_binary"] = tts(tts_mdl, answer)
@ -320,7 +320,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
nonlocal sys_prompt, user_promt, question, tried_times
sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], {
"temperature": 0.06})
logger.info(f"{question} ==> {user_promt} get SQL: {sql}")
logging.debug(f"{question} ==> {user_promt} get SQL: {sql}")
sql = re.sub(r"[\r\n]+", " ", sql.lower())
sql = re.sub(r".*select ", "select ", sql.lower())
sql = re.sub(r" +", " ", sql)
@ -340,7 +340,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
flds.append(k)
sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:]
logger.info(f"{question} get SQL(refined): {sql}")
logging.debug(f"{question} get SQL(refined): {sql}")
tried_times += 1
return retrievaler.sql_retrieval(sql, format="json"), sql
@ -369,9 +369,9 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
question, sql, tbl["error"]
)
tbl, sql = get_table()
logger.info("TRY it again: {}".format(sql))
logging.debug("TRY it again: {}".format(sql))
logger.info("GET table: {}".format(tbl))
logging.debug("GET table: {}".format(tbl))
if tbl.get("error") or len(tbl["rows"]) == 0:
return None
@ -401,7 +401,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
if not docid_idx or not docnm_idx:
logger.warning("SQL missing field: " + sql)
logging.warning("SQL missing field: " + sql)
return {
"answer": "\n".join([clmns, line, rows]),
"reference": {"chunks": [], "doc_aggs": []},

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import hashlib
import json
import random
@ -39,7 +40,6 @@ from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db import StatusEnum
from rag.utils.redis_conn import REDIS_CONN
from api.utils.log_utils import logger
class DocumentService(CommonService):
@ -387,7 +387,7 @@ class DocumentService(CommonService):
cls.update_by_id(d["id"], info)
except Exception as e:
if str(e).find("'0'") < 0:
logger.exception("fetch task exception")
logging.exception("fetch task exception")
@classmethod
@DB.connection_context()
@ -544,7 +544,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
"knowledge_graph_kwd": "mind_map"
})
except Exception as e:
logger.exception("Mind map generation error")
logging.exception("Mind map generation error")
vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
assert len(cks) == len(vects)

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import os
from concurrent.futures import ThreadPoolExecutor
@ -30,7 +31,6 @@ from api.db.services.file2document_service import File2DocumentService
from api.utils import get_uuid
from api.utils.file_utils import filename_type, thumbnail_img
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.log_utils import logger
class FileService(CommonService):
@ -276,7 +276,7 @@ class FileService(CommonService):
return cls.model.delete().where((cls.model.tenant_id == user_id)
& (cls.model.id == folder_id)).execute(),
except Exception:
logger.exception("delete_folder_by_pf_id")
logging.exception("delete_folder_by_pf_id")
raise RuntimeError("Database error (File retrieval)!")
@classmethod
@ -325,7 +325,7 @@ class FileService(CommonService):
try:
cls.filter_update((cls.model.id << file_ids, ), { 'parent_id': folder_id })
except Exception:
logger.exception("move_file")
logging.exception("move_file")
raise RuntimeError("Database error (File move)!")
@classmethod

View File

@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from api.db.services.user_service import TenantService
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel, TTSModel
from api.db import LLMType
from api.db.db_models import DB
from api.db.db_models import LLMFactories, LLM, TenantLLM
from api.db.services.common_service import CommonService
from api.utils.log_utils import logger
class LLMFactoriesService(CommonService):
@ -209,7 +209,7 @@ class LLMBundle(object):
emd, used_tokens = self.mdl.encode(texts, batch_size)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
logger.error(
logging.error(
"LLMBundle.encode can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
return emd, used_tokens
@ -217,7 +217,7 @@ class LLMBundle(object):
emd, used_tokens = self.mdl.encode_queries(query)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
logger.error(
logging.error(
"LLMBundle.encode_queries can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
return emd, used_tokens
@ -225,7 +225,7 @@ class LLMBundle(object):
sim, used_tokens = self.mdl.similarity(query, texts)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
logger.error(
logging.error(
"LLMBundle.similarity can't update token usage for {}/RERANK used_tokens: {}".format(self.tenant_id, used_tokens))
return sim, used_tokens
@ -233,7 +233,7 @@ class LLMBundle(object):
txt, used_tokens = self.mdl.describe(image, max_tokens)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
logger.error(
logging.error(
"LLMBundle.describe can't update token usage for {}/IMAGE2TEXT used_tokens: {}".format(self.tenant_id, used_tokens))
return txt
@ -241,7 +241,7 @@ class LLMBundle(object):
txt, used_tokens = self.mdl.transcription(audio)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
logger.error(
logging.error(
"LLMBundle.transcription can't update token usage for {}/SEQUENCE2TXT used_tokens: {}".format(self.tenant_id, used_tokens))
return txt
@ -250,7 +250,7 @@ class LLMBundle(object):
if isinstance(chunk,int):
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, chunk, self.llm_name):
logger.error(
logging.error(
"LLMBundle.tts can't update token usage for {}/TTS".format(self.tenant_id))
return
yield chunk
@ -259,7 +259,7 @@ class LLMBundle(object):
txt, used_tokens = self.mdl.chat(system, history, gen_conf)
if isinstance(txt, int) and not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens, self.llm_name):
logger.error(
logging.error(
"LLMBundle.chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.llm_name, used_tokens))
return txt
@ -268,7 +268,7 @@ class LLMBundle(object):
if isinstance(txt, int):
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, txt, self.llm_name):
logger.error(
logging.error(
"LLMBundle.chat_streamly can't update token usage for {}/CHAT llm_name: {}, content: {}".format(self.tenant_id, self.llm_name, txt))
return
yield txt

View File

@ -15,6 +15,17 @@
#
import logging
import inspect
from api.utils.log_utils import initRootLogger
initRootLogger(inspect.getfile(inspect.currentframe()))
for module in ["pdfminer"]:
module_logger = logging.getLogger(module)
module_logger.setLevel(logging.WARNING)
for module in ["peewee"]:
module_logger = logging.getLogger(module)
module_logger.handlers.clear()
module_logger.propagate = True
import os
import signal
import sys
@ -22,7 +33,6 @@ import time
import traceback
from concurrent.futures import ThreadPoolExecutor
import validation
from werkzeug.serving import run_simple
from api.apps import app
from api.db.runtime_config import RuntimeConfig
@ -31,7 +41,6 @@ from api.settings import (
HOST, HTTP_PORT
)
from api import utils
from api.utils.log_utils import logger
from api.db.db_models import init_database_tables as init_web_db
from api.db.init_data import init_web_data
@ -44,11 +53,11 @@ def update_progress():
try:
DocumentService.update_progress()
except Exception:
logger.exception("update_progress exception")
logging.exception("update_progress exception")
if __name__ == '__main__':
logger.info(r"""
logging.info(r"""
____ ___ ______ ______ __
/ __ \ / | / ____// ____// /____ _ __
/ /_/ // /| | / / __ / /_ / // __ \| | /| / /
@ -56,10 +65,10 @@ if __name__ == '__main__':
/_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/
""")
logger.info(
logging.info(
f'RAGFlow version: {RAGFLOW_VERSION_INFO}'
)
logger.info(
logging.info(
f'project base: {utils.file_utils.get_project_base_directory()}'
)
@ -83,26 +92,18 @@ if __name__ == '__main__':
RuntimeConfig.DEBUG = args.debug
if RuntimeConfig.DEBUG:
logger.info("run on debug mode")
logging.info("run on debug mode")
RuntimeConfig.init_env()
RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT)
peewee_logger = logging.getLogger("peewee")
peewee_logger.propagate = False
# rag_arch.common.log.ROpenHandler
peewee_logger.addHandler(logger.handlers[0])
peewee_logger.setLevel(logger.handlers[0].level)
thr = ThreadPoolExecutor(max_workers=1)
thr.submit(update_progress)
# start http server
try:
logger.info("RAG Flow http server start...")
werkzeug_logger = logging.getLogger("werkzeug")
for h in logger.handlers:
werkzeug_logger.addHandler(h)
logging.info("RAG Flow http server start...")
run_simple(
hostname=HOST,
port=HTTP_PORT,

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import functools
import json
import random
@ -40,7 +41,6 @@ from api.settings import (
from api.settings import RetCode
from api.utils import CustomJSONEncoder, get_uuid
from api.utils import json_dumps
from api.utils.log_utils import logger
requests.models.complexjson.dumps = functools.partial(
json.dumps, cls=CustomJSONEncoder)
@ -118,7 +118,7 @@ def get_data_error_result(code=RetCode.DATA_ERROR,
def server_error_response(e):
logger.exception(e)
logging.exception(e)
try:
if e.code == 401:
return get_json_result(code=401, message=repr(e))
@ -259,7 +259,7 @@ def construct_json_result(code=RetCode.SUCCESS, message='success', data=None):
def construct_error_response(e):
logger.exception(e)
logging.exception(e)
try:
if e.code == 401:
return construct_json_result(code=RetCode.UNAUTHORIZED, message=repr(e))

View File

@ -14,38 +14,41 @@
# limitations under the License.
#
import os
import os.path
import logging
from logging.handlers import RotatingFileHandler
from api.utils.file_utils import get_project_base_directory
def get_project_base_directory():
PROJECT_BASE = os.path.abspath(
os.path.join(
os.path.dirname(os.path.realpath(__file__)),
os.pardir,
os.pardir,
)
)
return PROJECT_BASE
LOG_LEVEL = logging.INFO
LOG_FILE = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"ragflow_{os.getpid()}.log"))
LOG_FORMAT = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"
logger = None
def initRootLogger(script_path: str, log_level: int = logging.INFO, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"):
logger = logging.getLogger()
if logger.hasHandlers():
return
def getLogger():
global logger
if logger is not None:
return logger
script_name = os.path.basename(script_path)
log_path = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"{os.path.splitext(script_name)[0]}.log"))
print(f"log file path: {LOG_FILE}")
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logger = logging.getLogger("ragflow")
logger.setLevel(LOG_LEVEL)
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logger.setLevel(log_level)
formatter = logging.Formatter(log_format)
handler1 = RotatingFileHandler(LOG_FILE, maxBytes=10*1024*1024, backupCount=5)
handler1.setLevel(LOG_LEVEL)
formatter1 = logging.Formatter(LOG_FORMAT)
handler1.setFormatter(formatter1)
handler1 = RotatingFileHandler(log_path, maxBytes=10*1024*1024, backupCount=5)
handler1.setLevel(log_level)
handler1.setFormatter(formatter)
logger.addHandler(handler1)
handler2 = logging.StreamHandler()
handler2.setLevel(LOG_LEVEL)
formatter2 = logging.Formatter(LOG_FORMAT)
handler2.setFormatter(formatter2)
handler2.setLevel(log_level)
handler2.setFormatter(formatter)
logger.addHandler(handler2)
return logger
logger = getLogger()
msg = f"{script_name} log path: {log_path}"
logger.info(msg)
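This hunk interleaves the removed `getLogger()` implementation with the added `initRootLogger()` and no +/- markers survive, so here is a best-effort reassembly of the new helper as one readable piece. It is a sketch built from the added lines above, not an authoritative copy of the file; in particular, whether `get_project_base_directory` ends up defined locally (as shown) or imported from `api.utils.file_utils` is ambiguous in this rendering.

```python
import logging
import os
import os.path
from logging.handlers import RotatingFileHandler


def get_project_base_directory():
    # Project root: two directories above this file (api/utils/ -> repo root).
    return os.path.abspath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir)
    )


def initRootLogger(script_path: str,
                   log_level: int = logging.INFO,
                   log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"):
    # Configure the process-wide root logger exactly once.
    logger = logging.getLogger()
    if logger.hasHandlers():
        return

    # Consistent log file name: <script name>.log under <project>/logs/,
    # replacing the old per-process ragflow_<pid>.log.
    script_name = os.path.basename(script_path)
    log_path = os.path.abspath(
        os.path.join(get_project_base_directory(), "logs",
                     f"{os.path.splitext(script_name)[0]}.log"))
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    logger.setLevel(log_level)
    formatter = logging.Formatter(log_format)

    # Rotating file handler and console handler, sharing one format.
    handler1 = RotatingFileHandler(log_path, maxBytes=10 * 1024 * 1024, backupCount=5)
    handler1.setLevel(log_level)
    handler1.setFormatter(formatter)
    logger.addHandler(handler1)

    handler2 = logging.StreamHandler()
    handler2.setLevel(log_level)
    handler2.setFormatter(formatter)
    logger.addHandler(handler2)

    logger.info(f"{script_name} log path: {log_path}")
```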

View File

@ -14,20 +14,20 @@
# limitations under the License.
#
import logging
import sys
from api.utils.log_utils import logger
def python_version_validation():
# Check python version
required_python_version = (3, 10)
if sys.version_info < required_python_version:
logger.info(
logging.info(
f"Required Python: >= {required_python_version[0]}.{required_python_version[1]}. Current Python version: {sys.version_info[0]}.{sys.version_info[1]}."
)
sys.exit(1)
else:
logger.info(f"Python version: {sys.version_info[0]}.{sys.version_info[1]}")
logging.info(f"Python version: {sys.version_info[0]}.{sys.version_info[1]}")
python_version_validation()

View File

@ -11,6 +11,7 @@
# limitations under the License.
#
import logging
import os
import random
@ -18,7 +19,6 @@ import xgboost as xgb
from io import BytesIO
import re
import pdfplumber
import logging
from PIL import Image
import numpy as np
from timeit import default_timer as timer
@ -26,15 +26,11 @@ from pypdf import PdfReader as pdf2_read
from api.settings import LIGHTEN
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
from copy import deepcopy
from huggingface_hub import snapshot_download
logging.getLogger("pdfminer").setLevel(logging.WARNING)
class RAGFlowPdfParser:
def __init__(self):
self.ocr = OCR()
@ -51,7 +47,7 @@ class RAGFlowPdfParser:
if torch.cuda.is_available():
self.updown_cnt_mdl.set_param({"device": "cuda"})
except Exception:
logger.exception("RAGFlowPdfParser __init__")
logging.exception("RAGFlowPdfParser __init__")
try:
model_dir = os.path.join(
get_project_base_directory(),
@ -188,7 +184,7 @@ class RAGFlowPdfParser:
return True
def _table_transformer_job(self, ZM):
logger.info("Table processing...")
logging.debug("Table processing...")
imgs, pos = [], []
tbcnt = [0]
MARGIN = 10
@ -426,7 +422,7 @@ class RAGFlowPdfParser:
detach_feats = [b["x1"] < b_["x0"],
b["x0"] > b_["x1"]]
if (any(feats) and not any(concatting_feats)) or any(detach_feats):
logger.info("{} {} {} {}".format(
logging.debug("{} {} {} {}".format(
b["text"],
b_["text"],
any(feats),
@ -727,14 +723,14 @@ class RAGFlowPdfParser:
# continue
if tv < fv and tk:
tables[tk].insert(0, c)
logger.debug(
logging.debug(
"TABLE:" +
self.boxes[i]["text"] +
"; Cap: " +
tk)
elif fk:
figures[fk].insert(0, c)
logger.debug(
logging.debug(
"FIGURE:" +
self.boxes[i]["text"] +
"; Cap: " +
@ -761,7 +757,7 @@ class RAGFlowPdfParser:
if ii is not None:
b = louts[ii]
else:
logger.warn(
logging.warn(
f"Missing layout match: {pn + 1},%s" %
(bxs[0].get(
"layoutno", "")))
@ -919,7 +915,7 @@ class RAGFlowPdfParser:
if usefull(boxes[0]):
dfs(boxes[0], 0)
else:
logger.debug("WASTE: " + boxes[0]["text"])
logging.debug("WASTE: " + boxes[0]["text"])
except Exception:
pass
boxes.pop(0)
@ -928,7 +924,7 @@ class RAGFlowPdfParser:
res.append(
"\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
else:
logger.debug("REMOVED: " +
logging.debug("REMOVED: " +
"<<".join([c["text"] for c in lines]))
return "\n\n".join(res)
@ -940,7 +936,7 @@ class RAGFlowPdfParser:
fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages)
except Exception:
logger.exception("total_page_number")
logging.exception("total_page_number")
def __images__(self, fnm, zoomin=3, page_from=0,
page_to=299, callback=None):
@ -964,7 +960,7 @@ class RAGFlowPdfParser:
self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages)
except Exception:
logger.exception("RAGFlowPdfParser __images__")
logging.exception("RAGFlowPdfParser __images__")
self.outlines = []
try:
@ -980,11 +976,11 @@ class RAGFlowPdfParser:
dfs(outlines, 0)
except Exception as e:
logger.warning(f"Outlines exception: {e}")
logging.warning(f"Outlines exception: {e}")
if not self.outlines:
logger.warning("Miss outlines")
logging.warning("Miss outlines")
logger.info("Images converted.")
logging.debug("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
range(len(self.page_chars))]
@ -1024,7 +1020,7 @@ class RAGFlowPdfParser:
self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
"".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
logger.info("Is it English:", self.is_english)
logging.debug("Is it English:", self.is_english)
self.page_cum_height = np.cumsum(self.page_cum_height)
assert len(self.page_cum_height) == len(self.page_images) + 1
@ -1164,9 +1160,9 @@ class PlainParser(object):
dfs(outlines, 0)
except Exception:
logger.exception("Outlines exception")
logging.exception("Outlines exception")
if not self.outlines:
logger.warning("Miss outlines")
logging.warning("Miss outlines")
return [(l, "") for l in lines], []

View File

@ -11,13 +11,13 @@
# limitations under the License.
#
import logging
import re
import json
import os
import pandas as pd
from rag.nlp import rag_tokenizer
from . import regions
from api.utils.log_utils import logger
current_file_path = os.path.dirname(os.path.abspath(__file__))
@ -71,7 +71,7 @@ GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
for c,v in CORP_TAG.items():
cc = corpNorm(rmNoise(c), False)
if not cc:
logger.info(c)
logging.debug(c)
CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}
def is_good(nm):

View File

@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import copy
import time
@ -23,7 +23,6 @@ from deepdoc.parser.resume.entities import degrees, schools, corporations
from rag.nlp import rag_tokenizer, surname
from xpinyin import Pinyin
from contextlib import contextmanager
from api.utils.log_utils import logger
class TimeoutException(Exception): pass
@ -164,7 +163,7 @@ def forEdu(cv):
y, m, d = getYMD(edu_end_dt)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
logger.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
if sch:
cv["school_name_kwd"] = sch
if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
@ -276,7 +275,7 @@ def forWork(cv):
try:
duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
except Exception:
logger.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
if n.get("scale"):
r = re.search(r"^([0-9]+)", str(n["scale"]))
@ -333,7 +332,7 @@ def forWork(cv):
y, m, d = getYMD(work_st_tm)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
logger.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
cv["job_num_int"] = 0
if duas:
@ -464,7 +463,7 @@ def parse(cv):
cv[f"{t}_kwd"] = nms
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
except Exception:
logger.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
cv[k] = []
# tokenize fields
@ -565,7 +564,7 @@ def parse(cv):
cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
except Exception as e:
logger.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
keys = list(cv.keys())
@ -580,7 +579,7 @@ def parse(cv):
cv["tob_resume_id"] = str(cv["tob_resume_id"])
cv["id"] = cv["tob_resume_id"]
logger.info("CCCCCCCCCCCCCCC")
logging.debug("CCCCCCCCCCCCCCC")
return dealWithInt64(cv)

View File

@ -14,13 +14,13 @@
# limitations under the License.
#
import logging
import sys
import six
import cv2
import numpy as np
import math
from PIL import Image
from api.utils.log_utils import logger
class DecodeImage(object):
@ -403,7 +403,7 @@ class DetResizeForTest(object):
return None, (None, None)
img = cv2.resize(img, (int(resize_w), int(resize_h)))
except BaseException:
logger.exception("{} {} {}".format(img.shape, resize_w, resize_h))
logging.exception("{} {} {}".format(img.shape, resize_w, resize_h))
sys.exit(0)
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)

View File

@ -11,6 +11,7 @@
# limitations under the License.
#
import logging
import os
from copy import deepcopy
@ -19,7 +20,6 @@ from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from .operators import *
from api.utils.log_utils import logger
class Recognizer(object):
@ -440,7 +440,7 @@ class Recognizer(object):
end_index = min((i + 1) * batch_size, len(imgs))
batch_image_list = imgs[start_index:end_index]
inputs = self.preprocess(batch_image_list)
logger.info("preprocess")
logging.debug("preprocess")
for ins in inputs:
bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr)
res.append(bb)

View File

@ -11,10 +11,10 @@
# limitations under the License.
#
import logging
import os
import PIL
from PIL import ImageDraw
from api.utils.log_utils import logger
def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
@ -25,7 +25,7 @@ def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
out_path = os.path.join(output_dir, f"{idx}.jpg")
im.save(out_path, quality=95)
logger.info("save result to: " + out_path)
logging.debug("save result to: " + out_path)
def draw_box(im, result, lables, threshold=0.5):

View File

@ -10,9 +10,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import sys
from api.utils.log_utils import logger
sys.path.insert(
0,
@ -59,7 +59,7 @@ def main(args):
} for t in lyt]
img = draw_box(images[i], lyt, labels, float(args.threshold))
img.save(outputs[i], quality=95)
logger.info("save result to: " + outputs[i])
logging.info("save result to: " + outputs[i])
def get_table_html(img, tb_cpns, ocr):

View File

@ -38,7 +38,7 @@ class TableStructureRecognizer(Recognizer):
super().__init__(self.labels, "tsr", os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"))
except Exception as e:
except Exception:
super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False))

View File

@ -5,6 +5,7 @@ Reference:
- [graphrag](https://github.com/microsoft/graphrag)
"""
import logging
import argparse
import json
import re
@ -17,7 +18,6 @@ import tiktoken
from graphrag.claim_prompt import CLAIM_EXTRACTION_PROMPT, CONTINUE_PROMPT, LOOP_PROMPT
from rag.llm.chat_model import Base as CompletionLLM
from graphrag.utils import ErrorHandlerFn, perform_variable_replacements
from api.utils.log_utils import logger
DEFAULT_TUPLE_DELIMITER = "<|>"
DEFAULT_RECORD_DELIMITER = "##"
@ -126,7 +126,7 @@ class ClaimExtractor:
]
source_doc_map[document_id] = text
except Exception as e:
logger.exception("error extracting claim")
logging.exception("error extracting claim")
self._on_error(
e,
traceback.format_exc(),
@ -265,4 +265,4 @@ if __name__ == "__main__":
"claim_description": ""
}
claim = ex(info)
logger.info(json.dumps(claim.output, ensure_ascii=False, indent=2))
logging.info(json.dumps(claim.output, ensure_ascii=False, indent=2))

View File

@ -5,6 +5,7 @@ Reference:
- [graphrag](https://github.com/microsoft/graphrag)
"""
import logging
import json
import re
import traceback
@ -19,7 +20,6 @@ from rag.llm.chat_model import Base as CompletionLLM
from graphrag.utils import ErrorHandlerFn, perform_variable_replacements, dict_has_keys_with_types
from rag.utils import num_tokens_from_string
from timeit import default_timer as timer
from api.utils.log_utils import logger
@dataclass
@ -80,7 +80,7 @@ class CommunityReportsExtractor:
response = re.sub(r"[^\}]*$", "", response)
response = re.sub(r"\{\{", "{", response)
response = re.sub(r"\}\}", "}", response)
logger.info(response)
logging.debug(response)
response = json.loads(response)
if not dict_has_keys_with_types(response, [
("title", str),
@ -92,7 +92,7 @@ class CommunityReportsExtractor:
response["weight"] = weight
response["entities"] = ents
except Exception as e:
logger.exception("CommunityReportsExtractor got exception")
logging.exception("CommunityReportsExtractor got exception")
self._on_error(e, traceback.format_exc(), None)
continue

View File

@ -5,19 +5,11 @@ Reference:
- [graphrag](https://github.com/microsoft/graphrag)
"""
import argparse
import html
import json
import logging
import numbers
import re
import traceback
from collections.abc import Callable
from dataclasses import dataclass
from graphrag.utils import ErrorHandlerFn, perform_variable_replacements
from rag.llm.chat_model import Base as CompletionLLM
import networkx as nx
from rag.utils import num_tokens_from_string

View File

@ -13,8 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import itertools
import logging
import itertools
import re
import traceback
from dataclasses import dataclass

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
from concurrent.futures import ThreadPoolExecutor
import json
@ -28,7 +29,6 @@ from graphrag.graph_extractor import GraphExtractor, DEFAULT_ENTITY_TYPES
from graphrag.mind_map_extractor import MindMapExtractor
from rag.nlp import rag_tokenizer
from rag.utils import num_tokens_from_string
from api.utils.log_utils import logger
def graph_merge(g1, g2):
@ -95,7 +95,7 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: List[str], callback, en
chunks = []
for n, attr in graph.nodes(data=True):
if attr.get("rank", 0) == 0:
logger.info(f"Ignore entity: {n}")
logging.debug(f"Ignore entity: {n}")
continue
chunk = {
"name_kwd": n,
@ -137,7 +137,7 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: List[str], callback, en
mg = mindmap(_chunks).output
if not len(mg.keys()): return chunks
logger.info(json.dumps(mg, ensure_ascii=False, indent=2))
logging.debug(json.dumps(mg, ensure_ascii=False, indent=2))
chunks.append(
{
"content_with_weight": json.dumps(mg, ensure_ascii=False, indent=2),

View File

@ -14,8 +14,6 @@ from graspologic.utils import largest_connected_component
import networkx as nx
from networkx import is_empty
log = logging.getLogger(__name__)
def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
"""Ensure an undirected graph with the same relationships will always be read the same way."""
@ -99,7 +97,7 @@ def run(graph: nx.Graph, args: dict[str, Any]) -> dict[int, dict[str, dict]]:
max_cluster_size = args.get("max_cluster_size", 12)
use_lcc = args.get("use_lcc", True)
if args.get("verbose", False):
log.info(
logging.debug(
"Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc
)
if not graph.nodes(): return {}

View File

@ -14,8 +14,8 @@
# limitations under the License.
#
import collections
import logging
import collections
import os
import re
import traceback
@ -29,7 +29,6 @@ from rag.llm.chat_model import Base as CompletionLLM
import markdown_to_json
from functools import reduce
from rag.utils import num_tokens_from_string
from api.utils.log_utils import logger
@dataclass
@ -193,6 +192,6 @@ class MindMapExtractor:
gen_conf = {"temperature": 0.5}
response = self._llm.chat(text, [{"role": "user", "content": "Output:"}], gen_conf)
response = re.sub(r"```[^\n]*", "", response)
logger.info(response)
logger.info(self._todict(markdown_to_json.dictify(response)))
logging.debug(response)
logging.debug(self._todict(markdown_to_json.dictify(response)))
return self._todict(markdown_to_json.dictify(response))

View File

@ -1,8 +1,8 @@
import logging
import requests
from bridge.context import ContextType # Import Context, ContextType
from bridge.reply import Reply, ReplyType # Import Reply, ReplyType
from bridge import *
from api.utils.log_utils import logger
from plugins import Plugin, register # Import Plugin and register
from plugins.event import Event, EventContext, EventAction # Import event-related classes
@ -16,7 +16,7 @@ class RAGFlowChat(Plugin):
self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
# Store conversation_id for each user
self.conversations = {}
logger.info("[RAGFlowChat] Plugin initialized")
logging.info("[RAGFlowChat] Plugin initialized")
def on_handle_context(self, e_context: EventContext):
context = e_context['context']
@ -45,7 +45,7 @@ class RAGFlowChat(Plugin):
user_id = session_id # Use session_id as user_id
if not api_key or not host_address:
logger.error("[RAGFlowChat] Missing configuration")
logging.error("[RAGFlowChat] Missing configuration")
return "The plugin configuration is incomplete. Please check the configuration."
headers = {
@ -63,20 +63,20 @@ class RAGFlowChat(Plugin):
}
try:
response = requests.get(url_new_conversation, headers=headers, params=params_new_conversation)
logger.debug(f"[RAGFlowChat] New conversation response: {response.text}")
logging.debug(f"[RAGFlowChat] New conversation response: {response.text}")
if response.status_code == 200:
data = response.json()
if data.get("code") == 0:
conversation_id = data["data"]["id"]
self.conversations[user_id] = conversation_id
else:
logger.error(f"[RAGFlowChat] Failed to create conversation: {data.get('message')}")
logging.error(f"[RAGFlowChat] Failed to create conversation: {data.get('message')}")
return f"Sorry, unable to create a conversation: {data.get('message')}"
else:
logger.error(f"[RAGFlowChat] HTTP error when creating conversation: {response.status_code}")
logging.error(f"[RAGFlowChat] HTTP error when creating conversation: {response.status_code}")
return f"Sorry, unable to connect to RAGFlow API (create conversation). HTTP status code: {response.status_code}"
except Exception as e:
logger.exception("[RAGFlowChat] Exception when creating conversation")
logging.exception("[RAGFlowChat] Exception when creating conversation")
return f"Sorry, an internal error occurred: {str(e)}"
# Step 2: Send the message and get a reply
@ -95,18 +95,18 @@ class RAGFlowChat(Plugin):
try:
response = requests.post(url_completion, headers=headers, json=payload_completion)
logger.debug(f"[RAGFlowChat] Completion response: {response.text}")
logging.debug(f"[RAGFlowChat] Completion response: {response.text}")
if response.status_code == 200:
data = response.json()
if data.get("code") == 0:
answer = data["data"]["answer"]
return answer
else:
logger.error(f"[RAGFlowChat] Failed to get answer: {data.get('message')}")
logging.error(f"[RAGFlowChat] Failed to get answer: {data.get('message')}")
return f"Sorry, unable to get a reply: {data.get('message')}"
else:
logger.error(f"[RAGFlowChat] HTTP error when getting answer: {response.status_code}")
logging.error(f"[RAGFlowChat] HTTP error when getting answer: {response.status_code}")
return f"Sorry, unable to connect to RAGFlow API (get reply). HTTP status code: {response.status_code}"
except Exception as e:
logger.exception("[RAGFlowChat] Exception when getting answer")
logging.exception("[RAGFlowChat] Exception when getting answer")
return f"Sorry, an internal error occurred: {str(e)}"

View File

@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
import re
from io import BytesIO
@ -20,7 +21,6 @@ from rag.nlp import bullets_category, is_english,remove_contents_table, \
tokenize_chunks
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
from api.utils.log_utils import logger
class Pdf(PdfParser):
@ -39,7 +39,7 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis finished")
logger.info("layouts: {}".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis finished")
self._text_merge()

View File

@ -11,6 +11,7 @@
# limitations under the License.
#
import logging
from email import policy
from email.parser import BytesParser
from rag.app.naive import chunk as naive_chunk
@ -18,7 +19,6 @@ import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
from api.utils.log_utils import logger
import io
@ -86,7 +86,7 @@ def chunk(
)
main_res.extend(tokenize_chunks(chunks, doc, eng, None))
logger.info("naive_merge({}): {}".format(filename, timer() - st))
logging.debug("naive_merge({}): {}".format(filename, timer() - st))
# get the attachment info
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")

View File

@ -21,7 +21,6 @@ from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge,
make_colon_as_title, tokenize_chunks, docx_question_level
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
from api.utils.log_utils import logger
class Docx(DocxParser):
@ -122,7 +121,7 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis finished")
logger.info("layouts:".format(
logging.debug("layouts:".format(
))
self._naive_vertical_merge()

View File

@ -14,6 +14,7 @@
# limitations under the License.
#
import logging
import copy
import re
@ -24,7 +25,6 @@ from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser
from docx import Document
from PIL import Image
from api.utils.log_utils import logger
class Pdf(PdfParser):
@ -48,11 +48,11 @@ class Pdf(PdfParser):
# for bb in self.boxes:
# for b in bb:
# print(b)
logger.info("OCR: {}".format(timer() - start))
logging.debug("OCR: {}".format(timer() - start))
self._layouts_rec(zoomin)
callback(0.65, "Layout analysis finished.")
logger.info("layouts: {}".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
self._table_transformer_job(zoomin)
callback(0.67, "Table analysis finished.")
self._text_merge()

View File

@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
from io import BytesIO
from docx import Document
@ -19,7 +20,6 @@ from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from api.utils.log_utils import logger
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
@ -41,13 +41,13 @@ class Docx(DocxParser):
try:
image_blob = related_part.image.blob
except UnrecognizedImageError:
logger.info("Unrecognized image format. Skipping image.")
logging.info("Unrecognized image format. Skipping image.")
return None
except UnexpectedEndOfFileError:
logger.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
return None
except InvalidImageStreamError:
logger.info("The recognized image stream appears to be corrupted. Skipping image.")
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
return None
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
@ -133,7 +133,7 @@ class Pdf(PdfParser):
callback
)
callback(msg="OCR finished")
logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin)
@ -147,7 +147,7 @@ class Pdf(PdfParser):
self._concat_downward()
# self._filter_forpages()
logger.info("layouts cost: {}s".format(timer() - start))
logging.info("layouts cost: {}s".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], tbls
@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return chunks
res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
logger.info("naive_merge({}): {}".format(filename, timer() - st))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -280,7 +280,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return chunks
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
logger.info("naive_merge({}): {}".format(filename, timer() - st))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res

View File

@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
from io import BytesIO
import re
@ -18,7 +19,6 @@ from deepdoc.parser.utils import get_text
from rag.app import laws
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
from api.utils.log_utils import logger
class Pdf(PdfParser):
@ -38,7 +38,7 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis finished.")
logger.info("layouts cost: {}s".format(timer() - start))
logging.debug("layouts cost: {}s".format(timer() - start))
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis finished.")
self._text_merge()

View File

@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import re
@ -17,7 +18,6 @@ from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
import numpy as np
from api.utils.log_utils import logger
class Pdf(PdfParser):
@ -41,7 +41,7 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis finished")
logger.info(f"layouts cost: {timer() - start}s")
logging.debug(f"layouts cost: {timer() - start}s")
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis finished")
self._text_merge()
@ -53,7 +53,7 @@ class Pdf(PdfParser):
# clean mess
if column_width < self.page_images[0].size[0] / zoomin / 2:
logger.info("two_column................... {} {}".format(column_width,
logging.debug("two_column................... {} {}".format(column_width,
self.page_images[0].size[0] / zoomin / 2))
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
for b in self.boxes:
@ -115,8 +115,8 @@ class Pdf(PdfParser):
from_page, min(
to_page, self.total_page)))
for b in self.boxes:
logger.info("{} {}".format(b["text"], b.get("layoutno")))
logger.info("{}".format(tbls))
logging.debug("{} {}".format(b["text"], b.get("layoutno")))
logging.debug("{}".format(tbls))
return {
"title": title,
@ -157,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
logger.info("It's English.....{}".format(eng))
logging.debug("It's English.....{}".format(eng))
res = tokenize_table(paper["tables"], doc, eng)
@ -184,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
chunks = []
last_sid = -2

View File

@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
from copy import deepcopy
from io import BytesIO
@ -19,7 +20,6 @@ from openpyxl import load_workbook
from deepdoc.parser.utils import get_text
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
from api.utils.log_utils import logger
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document
from PIL import Image
@ -82,7 +82,7 @@ class Pdf(PdfParser):
callback
)
callback(msg="OCR finished")
logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis finished.")
@ -94,7 +94,7 @@ class Pdf(PdfParser):
#self._naive_vertical_merge()
# self._concat_downward()
#self._filter_forpages()
logger.info("layouts: {}".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
sections = [b["text"] for b in self.boxes]
bull_x0_list = []
q_bull, reg = qbullets_category(sections)

View File

@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import base64
import datetime
import json
@ -20,7 +21,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from rag.nlp import rag_tokenizer
from deepdoc.parser.resume import refactor
from deepdoc.parser.resume import step_one, step_two
from api.utils.log_utils import logger
from rag.utils import rmSpace
forbidden_select_fields4resume = [
@ -64,7 +64,7 @@ def remote_call(filename, binary):
resume = step_two.parse(resume)
return resume
except Exception:
logger.exception("Resume parser error")
logging.exception("Resume parser error")
return {}
@ -86,7 +86,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
callback(-1, "Resume is not successfully parsed.")
raise Exception("Resume parser remote call fail!")
callback(0.6, "Done parsing. Chunking...")
logger.info("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2))
logging.debug("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2))
field_map = {
"name_kwd": "姓名/名字",
@ -158,7 +158,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
doc[n] = resume[n]
logger.info("chunked resume to " + str(doc))
logging.debug("chunked resume to " + str(doc))
KnowledgebaseService.update_parser_config(
kwargs["kb_id"], {"field_map": field_map})
return [doc]

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
from typing import Optional
import threading
@ -32,7 +33,6 @@ from api.utils.file_utils import get_home_cache_dir
from rag.utils import num_tokens_from_string, truncate
import google.generativeai as genai
import json
from api.utils.log_utils import logger
class Base(ABC):
def __init__(self, key, model_name):
@ -297,7 +297,7 @@ class YoudaoEmbed(Base):
if not LIGHTEN and not YoudaoEmbed._client:
from BCEmbedding import EmbeddingModel as qanthing
try:
logger.info("LOADING BCE...")
logging.info("LOADING BCE...")
YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join(
get_home_cache_dir(),
"bce-embedding-base_v1"))

View File

@ -27,7 +27,6 @@ from api.settings import LIGHTEN
from api.utils.file_utils import get_home_cache_dir
from rag.utils import num_tokens_from_string, truncate
import json
from api.utils.log_utils import logger
def sigmoid(x):
@ -127,7 +126,7 @@ class YoudaoRerank(DefaultRerank):
with YoudaoRerank._model_lock:
if not YoudaoRerank._model:
try:
logger.info("LOADING BCE...")
logging.info("LOADING BCE...")
YoudaoRerank._model = RerankerModel(model_name_or_path=os.path.join(
get_home_cache_dir(),
re.sub(r"^[a-zA-Z0-9]+/", "", model_name)))

View File

@ -14,6 +14,7 @@
# limitations under the License.
#
import logging
import random
from collections import Counter
@ -26,7 +27,6 @@ from word2number import w2n
from cn2an import cn2an
from PIL import Image
import json
from api.utils.log_utils import logger
all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@ -236,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
# wrap up as es documents
for ck in chunks:
if len(ck.strip()) == 0:continue
logger.debug("-- {}".format(ck))
logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
if pdf_parser:
try:
@ -255,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
# wrap up as es documents
for ck, image in zip(chunks, images):
if len(ck.strip()) == 0:continue
logger.debug("-- {}".format(ck))
logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
d["image"] = image
tokenize(d, ck, eng)
@ -458,7 +458,7 @@ def hierarchical_merge(bull, sections, depth):
for i in range(len(cks)):
cks[i] = [sections[j] for j in cks[i][::-1]]
logger.info("\n* ".join(cks[i]))
logging.debug("\n* ".join(cks[i]))
res = [[]]
num = [0]

View File

@ -14,9 +14,9 @@
# limitations under the License.
#
import logging
import json
import re
import logging
from rag.utils.doc_store_conn import MatchTextExpr
from rag.nlp import rag_tokenizer, term_weight, synonym
@ -88,7 +88,7 @@ class FulltextQueryer:
syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
syns.append(" ".join(syn))
q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
q = ["({}^{:.4f}".format(tk, w) + " %s)".format() for (tk, w), syn in zip(tks_w, syns)]
for i in range(1, len(tks_w)):
q.append(
'"%s %s"^%.4f'
@ -121,7 +121,7 @@ class FulltextQueryer:
twts = self.tw.weights([tt])
syns = self.syn.lookup(tt)
if syns: keywords.extend(syns)
logging.info(json.dumps(twts, ensure_ascii=False))
logging.debug(json.dumps(twts, ensure_ascii=False))
tms = []
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
sm = (

View File

@ -14,6 +14,7 @@
# limitations under the License.
#
import logging
import copy
import datrie
import math
@ -25,7 +26,6 @@ from hanziconv import HanziConv
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class RagTokenizer:
@ -36,7 +36,7 @@ class RagTokenizer:
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
def loadDict_(self, fnm):
logger.info(f"[HUQIE]:Build trie {fnm}")
logging.info(f"[HUQIE]:Build trie {fnm}")
try:
of = open(fnm, "r", encoding='utf-8')
while True:
@ -53,7 +53,7 @@ class RagTokenizer:
self.trie_.save(fnm + ".trie")
of.close()
except Exception:
logger.exception(f"[HUQIE]:Build trie {fnm} failed")
logging.exception(f"[HUQIE]:Build trie {fnm} failed")
def __init__(self, debug=False):
self.DEBUG = debug
@ -69,7 +69,7 @@ class RagTokenizer:
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
return
except Exception:
logger.exception("[HUQIE]:Build default trie")
logging.exception("[HUQIE]:Build default trie")
self.trie_ = datrie.Trie(string.printable)
self.loadDict_(self.DIR_ + ".txt")
@ -173,7 +173,7 @@ class RagTokenizer:
tks.append(tk)
F /= len(tks)
L /= len(tks)
logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
logging.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
return tks, B / len(tks) + L + F
def sortTks_(self, tkslist):
@ -277,8 +277,8 @@ class RagTokenizer:
tks, s = self.maxForward_(L)
tks1, s1 = self.maxBackward_(L)
if self.DEBUG:
logger.debug("[FW] {} {}".format(tks, s))
logger.debug("[BW] {} {}".format(tks1, s1))
logging.debug("[FW] {} {}".format(tks, s))
logging.debug("[BW] {} {}".format(tks1, s1))
i, j, _i, _j = 0, 0, 0, 0
same = 0
@ -325,7 +325,7 @@ class RagTokenizer:
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
res = " ".join(self.english_normalize_(res))
logger.debug("[TKS] {}".format(self.merge_(res)))
logging.debug("[TKS] {}".format(self.merge_(res)))
return self.merge_(res)
def fine_grained_tokenize(self, tks):
@ -416,30 +416,30 @@ if __name__ == '__main__':
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
tks = tknzr.tokenize(
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("虽然我不怎么玩")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
logger.info(tknzr.fine_grained_tokenize(tks))
logging.info(tknzr.fine_grained_tokenize(tks))
if len(sys.argv) < 2:
sys.exit()
tknzr.DEBUG = False
@ -449,5 +449,5 @@ if __name__ == '__main__':
line = of.readline()
if not line:
break
logger.info(tknzr.tokenize(line))
logging.info(tknzr.tokenize(line))
of.close()
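One side effect worth noting for the standalone smoke test above: these logging.info(...) calls now go to the root logger, and when rag_tokenizer.py is run directly nothing configures it, so the stdlib default threshold of WARNING suppresses the INFO output unless logging is set up first. A short sketch of how a caller could surface it — an assumption about usage, not part of this commit:

import logging

# Hypothetical setup for exercising the tokenizer module standalone; raises the
# root logger threshold to INFO so the smoke-test output above becomes visible.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logging.info("tokenizer smoke-test output is now visible")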

View File

@ -14,12 +14,12 @@
# limitations under the License.
#
import logging
import re
import json
from typing import List, Optional, Dict, Union
from dataclasses import dataclass
from api.utils.log_utils import logger
from rag.utils import rmSpace
from rag.nlp import rag_tokenizer, query
import numpy as np
@ -83,7 +83,7 @@ class Dealer:
orderBy.desc("create_timestamp_flt")
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
logger.info("Dealer.search TOTAL: {}".format(total))
logging.debug("Dealer.search TOTAL: {}".format(total))
else:
highlightFields = ["content_ltks", "title_tks"] if highlight else []
matchText, keywords = self.qryr.question(qst, min_match=0.3)
@ -91,7 +91,7 @@ class Dealer:
matchExprs = [matchText]
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
logger.info("Dealer.search TOTAL: {}".format(total))
logging.debug("Dealer.search TOTAL: {}".format(total))
else:
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
q_vec = matchDense.embedding_data
@ -102,7 +102,7 @@ class Dealer:
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
logger.info("Dealer.search TOTAL: {}".format(total))
logging.debug("Dealer.search TOTAL: {}".format(total))
# If result is empty, try again with lower min_match
if total == 0:
@ -112,7 +112,7 @@ class Dealer:
matchDense.extra_options["similarity"] = 0.17
res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
logger.info("Dealer.search 2 TOTAL: {}".format(total))
logging.debug("Dealer.search 2 TOTAL: {}".format(total))
for k in keywords:
kwds.add(k)
@ -123,7 +123,7 @@ class Dealer:
continue
kwds.add(kk)
logger.info(f"TOTAL: {total}")
logging.debug(f"TOTAL: {total}")
ids=self.dataStore.getChunkIds(res)
keywords=list(kwds)
highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
@ -180,7 +180,7 @@ class Dealer:
continue
idx.append(i)
pieces_.append(t)
logger.info("{} => {}".format(answer, pieces_))
logging.debug("{} => {}".format(answer, pieces_))
if not pieces_:
return answer, set([])
@ -201,7 +201,7 @@ class Dealer:
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
logger.info("{} SIM: {}".format(pieces_[i], mx))
logging.debug("{} SIM: {}".format(pieces_[i], mx))
if mx < thr:
continue
cites[idx[i]] = list(

View File

@ -14,13 +14,13 @@
# limitations under the License.
#
import logging
import json
import os
import time
import re
from nltk.corpus import wordnet
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class Dealer:
@ -33,14 +33,14 @@ class Dealer:
try:
self.dictionary = json.load(open(path, 'r'))
except Exception:
logger.warn("Missing synonym.json")
logging.warn("Missing synonym.json")
self.dictionary = {}
if not redis:
logger.warning(
logging.warning(
"Realtime synonym is disabled, since no redis connection.")
if not len(self.dictionary.keys()):
logger.warning("Fail to load synonym")
logging.warning("Fail to load synonym")
self.redis = redis
self.load()
@ -64,7 +64,7 @@ class Dealer:
d = json.loads(d)
self.dictionary = d
except Exception as e:
logger.error("Fail to load synonym!" + str(e))
logging.error("Fail to load synonym!" + str(e))
def lookup(self, tk):
if re.match(r"[a-z]+$", tk):

View File

@ -14,6 +14,7 @@
# limitations under the License.
#
import logging
import math
import json
import re
@ -21,7 +22,6 @@ import os
import numpy as np
from rag.nlp import rag_tokenizer
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class Dealer:
@ -83,11 +83,11 @@ class Dealer:
try:
self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
except Exception:
logger.warning("Load ner.json FAIL!")
logging.warning("Load ner.json FAIL!")
try:
self.df = load_dict(os.path.join(fnm, "term.freq"))
except Exception:
logger.warning("Load term.freq FAIL!")
logging.warning("Load term.freq FAIL!")
def pretoken(self, txt, num=False, stpwd=True):
patt = [

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
from concurrent.futures import ThreadPoolExecutor, ALL_COMPLETED, wait
from threading import Lock
@ -22,7 +23,6 @@ import numpy as np
from sklearn.mixture import GaussianMixture
from rag.utils import truncate
from api.utils.log_utils import logger
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
@ -62,13 +62,13 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
{"temperature": 0.3, "max_tokens": self._max_token}
)
cnt = re.sub("(······\n由于长度的原因,回答被截断了,要继续吗?|For the content length reason, it stopped, continue?)", "", cnt)
logger.info(f"SUM: {cnt}")
logging.debug(f"SUM: {cnt}")
embds, _ = self._embd_model.encode([cnt])
with lock:
if not len(embds[0]): return
chunks.append((cnt, embds[0]))
except Exception as e:
logger.exception("summarize got exception")
logging.exception("summarize got exception")
return e
labels = []
@ -104,7 +104,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
ck_idx = [i+start for i in range(len(lbls)) if lbls[i] == c]
threads.append(executor.submit(summarize, ck_idx, lock))
wait(threads, return_when=ALL_COMPLETED)
logger.info(str([t.result() for t in threads]))
logging.debug(str([t.result() for t in threads]))
assert len(chunks) - end == n_clusters, "{} vs. {}".format(len(chunks) - end, n_clusters)
labels.extend(lbls)

View File

@ -13,19 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import time
import traceback
from api.db.db_models import close_connection
from api.db.services.task_service import TaskService
from api.utils.log_utils import logger
from rag.utils.storage_factory import STORAGE_IMPL
from rag.utils.redis_conn import REDIS_CONN
def collect():
doc_locations = TaskService.get_ongoing_doc_name()
logger.info(doc_locations)
logging.debug(doc_locations)
if len(doc_locations) == 0:
time.sleep(1)
return
@ -34,7 +34,7 @@ def collect():
def main():
locations = collect()
if not locations:return
logger.info(f"TASKS: {len(locations)}")
logging.info(f"TASKS: {len(locations)}")
for kb_id, loc in locations:
try:
if REDIS_CONN.is_alive():
@ -43,7 +43,7 @@ def main():
if REDIS_CONN.exist(key):continue
file_bin = STORAGE_IMPL.get(kb_id, loc)
REDIS_CONN.transaction(key, file_bin, 12 * 60)
logger.info("CACHE: {}".format(loc))
logging.info("CACHE: {}".format(loc))
except Exception as e:
traceback.print_stack(e)
except Exception as e:

View File

@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import discord
import requests
import base64
import asyncio
from api.utils.log_utils import logger
URL = '{YOUR_IP_ADDRESS:PORT}/v1/api/completion_aibotk' # Default: https://demo.ragflow.io/v1/api/completion_aibotk
@ -37,7 +37,7 @@ client = discord.Client(intents=intents)
@client.event
async def on_ready():
logger.info(f'We have logged in as {client.user}')
logging.info(f'We have logged in as {client.user}')
@client.event

View File

@ -13,9 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import inspect
from api.utils.log_utils import initRootLogger
initRootLogger(inspect.getfile(inspect.currentframe()))
for module in ["pdfminer"]:
module_logger = logging.getLogger(module)
module_logger.setLevel(logging.WARNING)
for module in ["peewee"]:
module_logger = logging.getLogger(module)
module_logger.handlers.clear()
module_logger.propagate = True
import datetime
import json
import logging
import os
import hashlib
import copy
@ -42,7 +53,6 @@ from api.db.db_models import close_connection
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
from rag.nlp import search, rag_tokenizer
from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
from api.utils.log_utils import logger, LOG_FILE
from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME
from rag.utils import rmSpace, num_tokens_from_string
from rag.utils.redis_conn import REDIS_CONN, Payload
@ -90,7 +100,7 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing...
try:
TaskService.update_progress(task_id, d)
except Exception:
logger.exception(f"set_progress({task_id}) got exception")
logging.exception(f"set_progress({task_id}) got exception")
close_connection()
if cancel:
@ -110,7 +120,7 @@ def collect():
time.sleep(1)
return pd.DataFrame()
except Exception:
logger.exception("Get task event from queue exception")
logging.exception("Get task event from queue exception")
return pd.DataFrame()
msg = PAYLOAD.get_message()
@ -118,11 +128,11 @@ def collect():
return pd.DataFrame()
if TaskService.do_cancel(msg["id"]):
logger.info("Task {} has been canceled.".format(msg["id"]))
logging.info("Task {} has been canceled.".format(msg["id"]))
return pd.DataFrame()
tasks = TaskService.get_tasks(msg["id"])
if not tasks:
logger.warning("{} empty task!".format(msg["id"]))
logging.warning("{} empty task!".format(msg["id"]))
return []
tasks = pd.DataFrame(tasks)
@ -151,29 +161,29 @@ def build(row):
st = timer()
bucket, name = File2DocumentService.get_storage_address(doc_id=row["doc_id"])
binary = get_storage_binary(bucket, name)
logger.info(
logging.info(
"From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
except TimeoutError:
callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.")
logger.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"]))
logging.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"]))
return
except Exception as e:
if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"])
else:
callback(-1, "Get file from minio: %s" % str(e).replace("'", ""))
logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
logging.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
return
try:
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
to_page=row["to_page"], lang=row["language"], callback=callback,
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
logger.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"]))
logging.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"]))
except Exception as e:
callback(-1, "Internal server error while chunking: %s" %
str(e).replace("'", ""))
logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
logging.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
return
docs = []
@ -210,12 +220,12 @@ def build(row):
STORAGE_IMPL.put(row["kb_id"], d["id"], output_buffer.getvalue())
el += timer() - st
except Exception:
logger.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"]))
logging.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"]))
d["img_id"] = "{}-{}".format(row["kb_id"], d["id"])
del d["image"]
docs.append(d)
logger.info("MINIO PUT({}):{}".format(row["name"], el))
logging.info("MINIO PUT({}):{}".format(row["name"], el))
if row["parser_config"].get("auto_keywords", 0):
st = timer()
@ -345,7 +355,7 @@ def main():
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
except Exception as e:
callback(-1, msg=str(e))
logger.exception("LLMBundle got exception")
logging.exception("LLMBundle got exception")
continue
if r.get("task_type", "") == "raptor":
@ -354,12 +364,12 @@ def main():
cks, tk_count, vector_size = run_raptor(r, chat_mdl, embd_mdl, callback)
except Exception as e:
callback(-1, msg=str(e))
logger.exception("run_raptor got exception")
logging.exception("run_raptor got exception")
continue
else:
st = timer()
cks = build(r)
logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
logging.info("Build chunks({}): {}".format(r["name"], timer() - st))
if cks is None:
continue
if not cks:
@ -375,12 +385,12 @@ def main():
tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
except Exception as e:
callback(-1, "Embedding error:{}".format(str(e)))
logger.exception("run_rembedding got exception")
logging.exception("run_rembedding got exception")
tk_count = 0
logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st))
# logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
# logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
init_kb(r, vector_size)
chunk_count = len(set([c["id"] for c in cks]))
st = timer()
@ -391,11 +401,11 @@ def main():
if b % 128 == 0:
callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
if es_r:
callback(-1, f"Insert chunk error, detail info please check {LOG_FILE}. Please also check ES status!")
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
logger.error('Insert chunk error: ' + str(es_r))
logging.error('Insert chunk error: ' + str(es_r))
else:
if TaskService.do_cancel(r["id"]):
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
@ -404,7 +414,7 @@ def main():
callback(1., "Done!")
DocumentService.increment_chunk_num(
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
logger.info(
logging.info(
"Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
r["id"], tk_count, len(cks), timer() - st))
@ -421,16 +431,11 @@ def report_status():
obj[CONSUMER_NAME] = obj[CONSUMER_NAME][-60:]
REDIS_CONN.set_obj("TASKEXE", obj, 60*2)
except Exception:
logger.exception("report_status got exception")
logging.exception("report_status got exception")
time.sleep(30)
if __name__ == "__main__":
peewee_logger = logging.getLogger('peewee')
peewee_logger.propagate = False
peewee_logger.addHandler(logger.handlers[0])
peewee_logger.setLevel(logger.handlers[0].level)
exe = ThreadPoolExecutor(max_workers=1)
exe.submit(report_status)
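This file carries the heart of the PR: task_executor.py now calls initRootLogger(inspect.getfile(inspect.currentframe())) at import time, lowers pdfminer to WARNING, and clears peewee's handlers while letting it propagate, replacing the old __main__ wiring (removed above) that copied logger.handlers[0] onto the peewee logger. The implementation of initRootLogger lives in api/utils/log_utils.py and is not shown in this excerpt; the sketch below is only an assumption about its behaviour, inferred from the call site and the PR title about consistent log file names, not the actual code:

import logging
import os

def initRootLogger(script_path: str, log_dir: str = "logs", level: int = logging.INFO) -> None:
    # Assumed behaviour: derive a consistent log file name from the entry script
    # (e.g. logs/task_executor.log) and attach file + console handlers to the
    # root logger once per process, so every module can simply call logging.info(...).
    script_name = os.path.splitext(os.path.basename(script_path))[0]
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, script_name + ".log")

    formatter = logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s")
    root = logging.getLogger()
    root.setLevel(level)
    for handler in (logging.FileHandler(log_file), logging.StreamHandler()):
        handler.setFormatter(formatter)
        root.addHandler(handler)

# Usage at an entry point, mirroring the call site above:
# initRootLogger(inspect.getfile(inspect.currentframe()))

With the root logger configured this way, the per-library tweaks at the top of the file do the rest: peewee's records flow through the same shared handlers instead of a privately copied one, and pdfminer's chatter stays out of the log file.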

View File

@ -1,3 +1,4 @@
import logging
import os
import time
from io import BytesIO
@ -24,7 +25,7 @@ class RAGFlowAzureSasBlob(object):
try:
self.conn = ContainerClient.from_container_url(self.container_url + "?" + self.sas_token)
except Exception:
logger.exception("Fail to connect %s " % self.container_url)
logging.exception("Fail to connect %s " % self.container_url)
def __close__(self):
del self.conn
@ -39,7 +40,7 @@ class RAGFlowAzureSasBlob(object):
try:
return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
logging.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)
@ -47,7 +48,7 @@ class RAGFlowAzureSasBlob(object):
try:
self.conn.delete_blob(fnm)
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")
logging.exception(f"Fail rm {bucket}/{fnm}")
def get(self, bucket, fnm):
for _ in range(1):
@ -55,7 +56,7 @@ class RAGFlowAzureSasBlob(object):
r = self.conn.download_blob(fnm)
return r.read()
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
logging.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -64,7 +65,7 @@ class RAGFlowAzureSasBlob(object):
try:
return self.conn.get_blob_client(fnm).exists()
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
logging.exception(f"Fail put {bucket}/{fnm}")
return False
def get_presigned_url(self, bucket, fnm, expires):
@ -72,7 +73,7 @@ class RAGFlowAzureSasBlob(object):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
logging.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return

View File

@ -1,3 +1,4 @@
import logging
import os
import time
from rag import settings
@ -28,7 +29,7 @@ class RAGFlowAzureSpnBlob(object):
credentials = ClientSecretCredential(tenant_id=self.tenant_id, client_id=self.client_id, client_secret=self.secret, authority=AzureAuthorityHosts.AZURE_CHINA)
self.conn = FileSystemClient(account_url=self.account_url, file_system_name=self.container_name, credential=credentials)
except Exception:
logger.exception("Fail to connect %s" % self.account_url)
logging.exception("Fail to connect %s" % self.account_url)
def __close__(self):
del self.conn
@ -47,7 +48,7 @@ class RAGFlowAzureSpnBlob(object):
f.append_data(binary, offset=0, length=len(binary))
return f.flush_data(len(binary))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
logging.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)
@ -55,7 +56,7 @@ class RAGFlowAzureSpnBlob(object):
try:
self.conn.delete_file(fnm)
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")
logging.exception(f"Fail rm {bucket}/{fnm}")
def get(self, bucket, fnm):
for _ in range(1):
@ -64,7 +65,7 @@ class RAGFlowAzureSpnBlob(object):
r = client.download_file()
return r.read()
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
logging.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -74,7 +75,7 @@ class RAGFlowAzureSpnBlob(object):
client = self.conn.get_file_client(fnm)
return client.exists()
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
logging.exception(f"Fail put {bucket}/{fnm}")
return False
def get_presigned_url(self, bucket, fnm, expires):
@ -82,7 +83,7 @@ class RAGFlowAzureSpnBlob(object):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
logging.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return

View File

@ -1,3 +1,4 @@
import logging
import re
import json
import time
@ -8,7 +9,6 @@ import copy
from elasticsearch import Elasticsearch
from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
from elastic_transport import ConnectionTimeout
from api.utils.log_utils import logger
from rag import settings
from rag.utils import singleton
from api.utils.file_utils import get_project_base_directory
@ -22,7 +22,7 @@ from rag.nlp import is_english, rag_tokenizer
class ESConnection(DocStoreConnection):
def __init__(self):
self.info = {}
logger.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.")
logging.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.")
for _ in range(24):
try:
self.es = Elasticsearch(
@ -36,25 +36,25 @@ class ESConnection(DocStoreConnection):
self.info = self.es.info()
break
except Exception as e:
logger.warn(f"{str(e)}. Waiting Elasticsearch {settings.ES['hosts']} to be healthy.")
logging.warn(f"{str(e)}. Waiting Elasticsearch {settings.ES['hosts']} to be healthy.")
time.sleep(5)
if not self.es.ping():
msg = f"Elasticsearch {settings.ES['hosts']} didn't become healthy in 120s."
logger.error(msg)
logging.error(msg)
raise Exception(msg)
v = self.info.get("version", {"number": "8.11.3"})
v = v["number"].split(".")[0]
if int(v) < 8:
msg = f"Elasticsearch version must be greater than or equal to 8, current version: {v}"
logger.error(msg)
logging.error(msg)
raise Exception(msg)
fp_mapping = os.path.join(get_project_base_directory(), "conf", "mapping.json")
if not os.path.exists(fp_mapping):
msg = f"Elasticsearch mapping file not found at {fp_mapping}"
logger.error(msg)
logging.error(msg)
raise Exception(msg)
self.mapping = json.load(open(fp_mapping, "r"))
logger.info(f"Elasticsearch {settings.ES['hosts']} is healthy.")
logging.info(f"Elasticsearch {settings.ES['hosts']} is healthy.")
"""
Database operations
@ -79,13 +79,13 @@ class ESConnection(DocStoreConnection):
settings=self.mapping["settings"],
mappings=self.mapping["mappings"])
except Exception:
logger.exception("ES create index error %s" % (indexName))
logging.exception("ES create index error %s" % (indexName))
def deleteIdx(self, indexName: str, knowledgebaseId: str):
try:
return self.es.indices.delete(indexName, allow_no_indices=True)
except Exception:
logger.exception("ES delete index error %s" % (indexName))
logging.exception("ES delete index error %s" % (indexName))
def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
s = Index(indexName, self.es)
@ -93,7 +93,7 @@ class ESConnection(DocStoreConnection):
try:
return s.exists()
except Exception as e:
logger.exception("ES indexExist")
logging.exception("ES indexExist")
if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
continue
return False
@ -178,7 +178,7 @@ class ESConnection(DocStoreConnection):
s = s[offset:limit]
q = s.to_dict()
print(json.dumps(q), flush=True)
# logger.info("ESConnection.search [Q]: " + json.dumps(q))
logging.debug("ESConnection.search [Q]: " + json.dumps(q))
for i in range(3):
try:
@ -190,14 +190,14 @@ class ESConnection(DocStoreConnection):
_source=True)
if str(res.get("timed_out", "")).lower() == "true":
raise Exception("Es Timeout.")
logger.info("ESConnection.search res: " + str(res))
logging.debug("ESConnection.search res: " + str(res))
return res
except Exception as e:
logger.exception("ES search [Q]: " + str(q))
logging.exception("ES search [Q]: " + str(q))
if str(e).find("Timeout") > 0:
continue
raise e
logger.error("ES search timeout for 3 times!")
logging.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")
def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
@ -213,11 +213,11 @@ class ESConnection(DocStoreConnection):
chunk["id"] = chunkId
return chunk
except Exception as e:
logger.exception(f"ES get({chunkId}) got exception")
logging.exception(f"ES get({chunkId}) got exception")
if str(e).find("Timeout") > 0:
continue
raise e
logger.error("ES search timeout for 3 times!")
logging.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")
def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str) -> list[str]:
@ -247,7 +247,7 @@ class ESConnection(DocStoreConnection):
res.append(str(item[action]["_id"]) + ":" + str(item[action]["error"]))
return res
except Exception as e:
logger.warning("Fail to bulk: " + str(e))
logging.warning("Fail to bulk: " + str(e))
if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
time.sleep(3)
continue
@ -264,7 +264,7 @@ class ESConnection(DocStoreConnection):
self.es.update(index=indexName, id=chunkId, doc=doc)
return True
except Exception as e:
logger.exception(
logging.exception(
f"ES failed to update(index={indexName}, id={id}, doc={json.dumps(condition, ensure_ascii=False)})")
if str(e).find("Timeout") > 0:
continue
@ -304,7 +304,7 @@ class ESConnection(DocStoreConnection):
_ = ubq.execute()
return True
except Exception as e:
logger.error("ES update exception: " + str(e) + "[Q]:" + str(bqry.to_dict()))
logging.error("ES update exception: " + str(e) + "[Q]:" + str(bqry.to_dict()))
if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
continue
return False
@ -326,7 +326,7 @@ class ESConnection(DocStoreConnection):
qry.must.append(Q("term", **{k: v}))
else:
raise Exception("Condition value must be int, str or list.")
logger.info("ESConnection.delete [Q]: " + json.dumps(qry.to_dict()))
logging.debug("ESConnection.delete [Q]: " + json.dumps(qry.to_dict()))
for _ in range(10):
try:
res = self.es.delete_by_query(
@ -335,7 +335,7 @@ class ESConnection(DocStoreConnection):
refresh=True)
return res["deleted"]
except Exception as e:
logger.warning("Fail to delete: " + str(filter) + str(e))
logging.warning("Fail to delete: " + str(filter) + str(e))
if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
time.sleep(3)
continue
@ -419,7 +419,7 @@ class ESConnection(DocStoreConnection):
"""
def sql(self, sql: str, fetch_size: int, format: str):
logger.info(f"ESConnection.sql get sql: {sql}")
logging.debug(f"ESConnection.sql get sql: {sql}")
sql = re.sub(r"[ `]+", " ", sql)
sql = sql.replace("%", "")
replaces = []
@ -436,7 +436,7 @@ class ESConnection(DocStoreConnection):
for p, r in replaces:
sql = sql.replace(p, r, 1)
logger.info(f"ESConnection.sql to es: {sql}")
logging.debug(f"ESConnection.sql to es: {sql}")
for i in range(3):
try:
@ -444,10 +444,10 @@ class ESConnection(DocStoreConnection):
request_timeout="2s")
return res
except ConnectionTimeout:
logger.exception("ESConnection.sql timeout [Q]: " + sql)
logging.exception("ESConnection.sql timeout [Q]: " + sql)
continue
except Exception:
logger.exception("ESConnection.sql got exception [Q]: " + sql)
logging.exception("ESConnection.sql got exception [Q]: " + sql)
return None
logger.error("ESConnection.sql timeout for 3 times!")
logging.error("ESConnection.sql timeout for 3 times!")
return None

View File

@ -1,3 +1,4 @@
import logging
import os
import re
import json
@ -7,7 +8,6 @@ import infinity
from infinity.common import ConflictType, InfinityException
from infinity.index import IndexInfo, IndexType
from infinity.connection_pool import ConnectionPool
from api.utils.log_utils import logger
from rag import settings
from rag.utils import singleton
import polars as pl
@ -56,7 +56,7 @@ class InfinityConnection(DocStoreConnection):
host, port = infinity_uri.split(":")
infinity_uri = infinity.common.NetworkAddress(host, int(port))
self.connPool = None
logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
logging.info(f"Use Infinity {infinity_uri} as the doc engine.")
for _ in range(24):
try:
connPool = ConnectionPool(infinity_uri)
@ -66,13 +66,13 @@ class InfinityConnection(DocStoreConnection):
self.connPool = connPool
break
except Exception as e:
logger.warn(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
logging.warn(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
time.sleep(5)
if self.connPool is None:
msg = f"Infinity {infinity_uri} didn't become healthy in 120s."
logger.error(msg)
logging.error(msg)
raise Exception(msg)
logger.info(f"Infinity {infinity_uri} is healthy.")
logging.info(f"Infinity {infinity_uri} is healthy.")
"""
Database operations
@ -148,7 +148,7 @@ class InfinityConnection(DocStoreConnection):
)
break
self.connPool.release_conn(inf_conn)
logger.info(
logging.info(
f"INFINITY created table {table_name}, vector size {vectorSize}"
)
@ -158,7 +158,7 @@ class InfinityConnection(DocStoreConnection):
db_instance = inf_conn.get_database(self.dbName)
db_instance.drop_table(table_name, ConflictType.Ignore)
self.connPool.release_conn(inf_conn)
logger.info(f"INFINITY dropped table {table_name}")
logging.info(f"INFINITY dropped table {table_name}")
def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
table_name = f"{indexName}_{knowledgebaseId}"
@ -169,7 +169,7 @@ class InfinityConnection(DocStoreConnection):
self.connPool.release_conn(inf_conn)
return True
except Exception as e:
logger.warn(f"INFINITY indexExist {str(e)}")
logging.warn(f"INFINITY indexExist {str(e)}")
return False
"""
@ -216,7 +216,7 @@ class InfinityConnection(DocStoreConnection):
)
if len(filter_cond) != 0:
filter_fulltext = f"({filter_cond}) AND {filter_fulltext}"
# logger.info(f"filter_fulltext: {filter_fulltext}")
logging.debug(f"filter_fulltext: {filter_fulltext}")
minimum_should_match = "0%"
if "minimum_should_match" in matchExpr.extra_options:
minimum_should_match = (
@ -279,7 +279,7 @@ class InfinityConnection(DocStoreConnection):
df_list.append(kb_res)
self.connPool.release_conn(inf_conn)
res = pl.concat(df_list)
logger.info("INFINITY search tables: " + str(table_list))
logging.debug("INFINITY search tables: " + str(table_list))
return res
def get(
@ -334,18 +334,18 @@ class InfinityConnection(DocStoreConnection):
str_filter = f"id IN ({str_ids})"
table_instance.delete(str_filter)
# for doc in documents:
# logger.info(f"insert position_list: {doc['position_list']}")
# logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
# logging.info(f"insert position_list: {doc['position_list']}")
# logging.info(f"InfinityConnection.insert {json.dumps(documents)}")
table_instance.insert(documents)
self.connPool.release_conn(inf_conn)
logger.info(f"inserted into {table_name} {str_ids}.")
logging.debug(f"inserted into {table_name} {str_ids}.")
return []
def update(
self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
) -> bool:
# if 'position_list' in newValue:
# logger.info(f"upsert position_list: {newValue['position_list']}")
# logging.info(f"upsert position_list: {newValue['position_list']}")
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
table_name = f"{indexName}_{knowledgebaseId}"
@ -366,7 +366,7 @@ class InfinityConnection(DocStoreConnection):
try:
table_instance = db_instance.get_table(table_name)
except Exception:
logger.warning(
logging.warning(
f"Skipped deleting `{filter}` from table {table_name} since the table doesn't exist."
)
return 0

View File

@ -1,9 +1,9 @@
import logging
import time
from minio import Minio
from io import BytesIO
from rag import settings
from rag.utils import singleton
from api.utils.log_utils import logger
@singleton
@ -26,7 +26,7 @@ class RAGFlowMinio(object):
secure=False
)
except Exception:
logger.exception(
logging.exception(
"Fail to connect %s " % settings.MINIO["host"])
def __close__(self):
@ -55,7 +55,7 @@ class RAGFlowMinio(object):
)
return r
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
@ -63,7 +63,7 @@ class RAGFlowMinio(object):
try:
self.conn.remove_object(bucket, fnm)
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
def get(self, bucket, fnm):
for _ in range(1):
@ -71,7 +71,7 @@ class RAGFlowMinio(object):
r = self.conn.get_object(bucket, fnm)
return r.read()
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
return
@ -81,7 +81,7 @@ class RAGFlowMinio(object):
if self.conn.stat_object(bucket, fnm):return True
return False
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
return False
@ -90,7 +90,7 @@ class RAGFlowMinio(object):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
return

View File

@ -1,7 +1,7 @@
import logging
import json
import valkey as redis
import logging
from rag import settings
from rag.utils import singleton

View File

@ -1,3 +1,4 @@
import logging
import boto3
import os
from botocore.exceptions import ClientError
@ -40,7 +41,7 @@ class RAGFlowS3(object):
config=config
)
except Exception:
logger.exception(
logging.exception(
"Fail to connect %s" % self.endpoint)
def __close__(self):
@ -49,11 +50,11 @@ class RAGFlowS3(object):
def bucket_exists(self, bucket):
try:
logger.debug(f"head_bucket bucketname {bucket}")
logging.debug(f"head_bucket bucketname {bucket}")
self.conn.head_bucket(Bucket=bucket)
exists = True
except ClientError:
logger.exception(f"head_bucket error {bucket}")
logging.exception(f"head_bucket error {bucket}")
exists = False
return exists
@ -62,7 +63,7 @@ class RAGFlowS3(object):
if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
logger.debug(f"create bucket {bucket} ********")
logging.debug(f"create bucket {bucket} ********")
r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
return r
@ -74,17 +75,17 @@ class RAGFlowS3(object):
return []
def put(self, bucket, fnm, binary):
logger.debug(f"bucket name {bucket}; filename :{fnm}:")
logging.debug(f"bucket name {bucket}; filename :{fnm}:")
for _ in range(1):
try:
if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
logger.info(f"create bucket {bucket} ********")
logging.info(f"create bucket {bucket} ********")
r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
return r
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
logging.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)
@ -92,7 +93,7 @@ class RAGFlowS3(object):
try:
self.conn.delete_object(Bucket=bucket, Key=fnm)
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")
logging.exception(f"Fail rm {bucket}/{fnm}")
def get(self, bucket, fnm):
for _ in range(1):
@ -101,7 +102,7 @@ class RAGFlowS3(object):
object_data = r['Body'].read()
return object_data
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
logging.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -128,7 +129,7 @@ class RAGFlowS3(object):
return r
except Exception:
logger.exception(f"fail get url {bucket}/{fnm}")
logging.exception(f"fail get url {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return