Rework logging (#3358)

Unified all log files into one.

### What problem does this PR solve?

Unified all log files into one.

### Type of change

- [x] Refactoring

This commit is contained in:
parent 567a7563e7
commit a2a5631da4
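As a quick orientation before the diff: after this change, modules import the single shared logger from `api.utils.log_utils` instead of the per-subsystem loggers (`flow_logger`, `stat_logger`, `chat_logger`, `database_logger`, `access_logger`) and ad-hoc `print()` calls. A minimal usage sketch follows; the `do_work` function is only an illustration and not code from this commit:

```python
from api.utils.log_utils import logger  # single "ragflow" logger defined once in api/utils/log_utils.py


def do_work(task_id: str) -> None:
    # replaces scattered print()/flow_logger/stat_logger/chat_logger calls
    logger.info("start task %s", task_id)
    try:
        1 / 0  # placeholder failure, for illustration only
    except Exception:
        # replaces traceback.print_exc() and stat_logger.exception(e)
        logger.exception("task %s failed", task_id)
```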
@@ -14,14 +14,12 @@
# limitations under the License.
#
import json
import traceback
from abc import ABC
from copy import deepcopy
from functools import partial
from agent.component import component_class
from agent.component.base import ComponentBase
from agent.settings import flow_logger, DEBUG

from api.utils.log_utils import logger

class Canvas(ABC):
"""
@@ -189,7 +187,7 @@ class Canvas(ABC):
if cpn.component_name == "Answer":
self.answer.append(c)
else:
if DEBUG: print("RUN: ", c)
logger.debug(f"Canvas.prepare2run: {c}")
cpids = cpn.get_dependent_components()
if any([c not in self.path[-1] for c in cpids]):
continue
@@ -199,7 +197,7 @@

prepare2run(self.components[self.path[-2][-1]]["downstream"])
while 0 <= ran < len(self.path[-1]):
if DEBUG: print(ran, self.path)
logger.debug(f"Canvas.run: {ran} {self.path}")
cpn_id = self.path[-1][ran]
cpn = self.get_component(cpn_id)
if not cpn["downstream"]: break
@@ -219,7 +217,7 @@
self.get_component(p)["obj"].set_exception(e)
prepare2run([p])
break
traceback.print_exc()
logger.exception("Canvas.run got exception")
break
continue

@@ -231,7 +229,7 @@
self.get_component(p)["obj"].set_exception(e)
prepare2run([p])
break
traceback.print_exc()
logger.exception("Canvas.run got exception")
break

if self.answer:
@@ -16,9 +16,8 @@
from abc import ABC
import arxiv
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase

from api.utils.log_utils import logger

class ArXivParam(ComponentParamBase):
"""
@@ -65,5 +64,5 @@ class ArXiv(ComponentBase, ABC):
return ArXiv.be_output("")

df = pd.DataFrame(arxiv_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug(f"df: {str(df)}")
return df
@@ -13,14 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
from abc import ABC
from functools import partial
import pandas as pd
import requests
import re
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger


class BaiduParam(ComponentParamBase):
@@ -64,6 +62,6 @@ class Baidu(ComponentBase, ABC):
return Baidu.be_output("")

df = pd.DataFrame(baidu_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug(f"df: {str(df)}")
return df
@@ -17,14 +17,14 @@ from abc import ABC
import builtins
import json
import os
from copy import deepcopy
from functools import partial
from typing import List, Dict, Tuple, Union
from typing import Tuple, Union

import pandas as pd

from agent import settings
from agent.settings import flow_logger, DEBUG
from api.utils.log_utils import logger


_FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params"
_DEPRECATED_PARAMS = "_deprecated_params"
@@ -361,13 +361,13 @@ class ComponentParamBase(ABC):

def _warn_deprecated_param(self, param_name, descr):
if self._deprecated_params_set.get(param_name):
flow_logger.warning(
logger.warning(
f"{descr} {param_name} is deprecated and ignored in this version."
)

def _warn_to_deprecate_param(self, param_name, descr, new_param):
if self._deprecated_params_set.get(param_name):
flow_logger.warning(
logger.warning(
f"{descr} {param_name} will be deprecated in future release; "
f"please use {new_param} instead."
)
@@ -403,7 +403,7 @@ class ComponentBase(ABC):
return cpnts

def run(self, history, **kwargs):
flow_logger.info("{}, history: {}, kwargs: {}".format(self, json.dumps(history, ensure_ascii=False),
logger.info("{}, history: {}, kwargs: {}".format(self, json.dumps(history, ensure_ascii=False),
json.dumps(kwargs, ensure_ascii=False)))
try:
res = self._run(history, **kwargs)
@@ -463,7 +463,7 @@ class ComponentBase(ABC):
reversed_cpnts.extend(self._canvas.path[-2])
reversed_cpnts.extend(self._canvas.path[-1])

if DEBUG: print(self.component_name, reversed_cpnts[::-1])
logger.debug(f"{self.component_name} {reversed_cpnts[::-1]}")
for u in reversed_cpnts[::-1]:
if self.get_component_name(u) in ["switch", "concentrator"]: continue
if self.component_name.lower() == "generate" and self.get_component_name(u) == "retrieval":
@@ -16,9 +16,8 @@
from abc import ABC
import requests
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase

from api.utils.log_utils import logger

class BingParam(ComponentParamBase):
"""
@@ -81,5 +80,5 @@ class Bing(ComponentBase, ABC):
return Bing.be_output("")

df = pd.DataFrame(bing_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug(f"df: {str(df)}")
return df
@@ -17,7 +17,7 @@ from abc import ABC
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from agent.component import GenerateParam, Generate
from agent.settings import DEBUG
from api.utils.log_utils import logger


class CategorizeParam(GenerateParam):
@@ -34,7 +34,7 @@ class CategorizeParam(GenerateParam):
super().check()
self.check_empty(self.category_description, "[Categorize] Category examples")
for k, v in self.category_description.items():
if not k: raise ValueError(f"[Categorize] Category name can not be empty!")
if not k: raise ValueError("[Categorize] Category name can not be empty!")
if not v.get("to"): raise ValueError(f"[Categorize] 'To' of category {k} can not be empty!")

def get_prompt(self):
@@ -77,7 +77,7 @@ class Categorize(Generate, ABC):
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": input}],
self._param.gen_conf())
if DEBUG: print(ans, ":::::::::::::::::::::::::::::::::", input)
logger.debug(f"input: {input}, answer: {str(ans)}")
for c in self._param.category_description.keys():
if ans.lower().find(c.lower()) >= 0:
return Categorize.be_output(self._param.category_description[c]["to"])
@@ -16,8 +16,8 @@
from abc import ABC
from duckduckgo_search import DDGS
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger


class DuckDuckGoParam(ComponentParamBase):
@@ -62,5 +62,5 @@ class DuckDuckGo(ComponentBase, ABC):
return DuckDuckGo.be_output("")

df = pd.DataFrame(duck_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug("df: {df}")
return df
@@ -16,8 +16,8 @@
from abc import ABC
import pandas as pd
import requests
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger


class GitHubParam(ComponentParamBase):
@@ -57,5 +57,5 @@ class GitHub(ComponentBase, ABC):
return GitHub.be_output("")

df = pd.DataFrame(github_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug(f"df: {df}")
return df
@@ -16,8 +16,8 @@
from abc import ABC
from serpapi import GoogleSearch
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger


class GoogleParam(ComponentParamBase):
@@ -85,12 +85,12 @@ class Google(ComponentBase, ABC):
"hl": self._param.language, "num": self._param.top_n})
google_res = [{"content": '<a href="' + i["link"] + '">' + i["title"] + '</a> ' + i["snippet"]} for i in
client.get_dict()["organic_results"]]
except Exception as e:
except Exception:
return Google.be_output("**ERROR**: Existing Unavailable Parameters!")

if not google_res:
return Google.be_output("")

df = pd.DataFrame(google_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug(f"df: {df}")
return df
@@ -15,9 +15,9 @@
#
from abc import ABC
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase
from scholarly import scholarly
from api.utils.log_utils import logger


class GoogleScholarParam(ComponentParamBase):
@@ -58,13 +58,13 @@ class GoogleScholar(ComponentBase, ABC):
'pub_url'] + '"></a> ' + "\n author: " + ",".join(pub['bib']['author']) + '\n Abstract: ' + pub[
'bib'].get('abstract', 'no abstract')})

except StopIteration or Exception as e:
print("**ERROR** " + str(e))
except StopIteration or Exception:
logger.exception("GoogleScholar")
break

if not scholar_res:
return GoogleScholar.be_output("")

df = pd.DataFrame(scholar_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug(f"df: {df}")
return df
@@ -18,7 +18,7 @@ from abc import ABC
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from agent.component import GenerateParam, Generate
from agent.settings import DEBUG
from api.utils.log_utils import logger


class KeywordExtractParam(GenerateParam):
@@ -58,5 +58,5 @@ class KeywordExtract(Generate, ABC):
self._param.gen_conf())

ans = re.sub(r".*keyword:", "", ans).strip()
if DEBUG: print(ans, ":::::::::::::::::::::::::::::::::")
logger.info(f"ans: {ans}")
return KeywordExtract.be_output(ans)
@@ -18,8 +18,8 @@ from Bio import Entrez
import re
import pandas as pd
import xml.etree.ElementTree as ET
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger


class PubMedParam(ComponentParamBase):
@@ -65,5 +65,5 @@ class PubMed(ComponentBase, ABC):
return PubMed.be_output("")

df = pd.DataFrame(pubmed_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug(f"df: {df}")
return df
@@ -18,6 +18,7 @@ from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from agent.component import GenerateParam, Generate
from rag.utils import num_tokens_from_string, encoder
from api.utils.log_utils import logger


class RelevantParam(GenerateParam):
@@ -70,7 +71,7 @@ class Relevant(Generate, ABC):
ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": ans}],
self._param.gen_conf())

print(ans, ":::::::::::::::::::::::::::::::::")
logger.info(ans)
if ans.lower().find("yes") >= 0:
return Relevant.be_output(self._param.yes)
if ans.lower().find("no") >= 0:
@@ -22,6 +22,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.settings import retrievaler
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger


class RetrievalParam(ComponentParamBase):
@@ -80,7 +81,7 @@ class Retrieval(ComponentBase, ABC):
df = pd.DataFrame(kbinfos["chunks"])
df["content"] = df["content_with_weight"]
del df["content_with_weight"]
print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n", query, df)
logger.debug("{} {}".format(query, df))
return df
@@ -17,6 +17,7 @@ from abc import ABC
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from agent.component import GenerateParam, Generate
from api.utils.log_utils import logger


class RewriteQuestionParam(GenerateParam):
@@ -104,7 +105,7 @@ class RewriteQuestion(Generate, ABC):
self._canvas.history.pop()
self._canvas.history.append(("user", ans))

print(ans, ":::::::::::::::::::::::::::::::::")
logger.info(ans)
return RewriteQuestion.be_output(ans)
@@ -13,13 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
from abc import ABC
from functools import partial
import wikipedia
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.log_utils import logger


class WikipediaParam(ComponentParamBase):
@@ -65,5 +63,5 @@ class Wikipedia(ComponentBase, ABC):
return Wikipedia.be_output("")

df = pd.DataFrame(wiki_res)
if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
logger.debug(f"df: {df}")
return df
@@ -17,6 +17,7 @@ from abc import ABC
import pandas as pd
from agent.component.base import ComponentBase, ComponentParamBase
import yfinance as yf
from api.utils.log_utils import logger


class YahooFinanceParam(ComponentParamBase):
@@ -74,8 +75,8 @@ class YahooFinance(ComponentBase, ABC):
{"content": "quarterly cash flow statement:\n" + msft.quarterly_cashflow.to_markdown() + "\n"})
if self._param.news:
yohoo_res.append({"content": "news:\n" + pd.DataFrame(msft.news).to_markdown() + "\n"})
except Exception as e:
print("**ERROR** " + str(e))
except Exception:
logger.exception("YahooFinance got exception")

if not yohoo_res:
return YahooFinance.be_output("")
@@ -13,22 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Logger
import os

from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import LoggerFactory, getLogger

DEBUG = 0
LoggerFactory.set_directory(
os.path.join(
get_project_base_directory(),
"logs",
"flow"))
# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
LoggerFactory.LEVEL = 30

flow_logger = getLogger("flow")
database_logger = getLogger("database")
FLOAT_ZERO = 1e-8
PARAM_MAXDEPTH = 5
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import sys
from importlib.util import module_from_spec, spec_from_file_location
@@ -30,18 +29,14 @@ from api.utils import CustomJSONEncoder, commands

from flask_session import Session
from flask_login import LoginManager
from api.settings import SECRET_KEY, stat_logger
from api.settings import API_VERSION, access_logger
from api.settings import SECRET_KEY
from api.settings import API_VERSION
from api.utils.api_utils import server_error_response
from api.utils.log_utils import logger
from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer

__all__ = ["app"]


logger = logging.getLogger("flask.app")
for h in access_logger.handlers:
logger.addHandler(h)

Request.json = property(lambda self: self.get_json(force=True, silent=True))

app = Flask(__name__)
@@ -158,8 +153,8 @@ def load_user(web_request):
return user[0]
else:
return None
except Exception as e:
stat_logger.exception(e)
except Exception:
logger.exception("load_user got exception")
return None
else:
return None
@@ -23,6 +23,7 @@ from api.utils import get_uuid
from api.utils.api_utils import get_json_result, server_error_response, validate_request, get_data_error_result
from agent.canvas import Canvas
from peewee import MySQLDatabase, PostgresqlDatabase
from api.utils.log_utils import logger


@manager.route('/templates', methods=['GET'])
@@ -114,7 +115,7 @@ def run():
pass
canvas.add_user_input(req["message"])
answer = canvas.run(stream=stream)
print(canvas)
logger.info(canvas)
except Exception as e:
return server_error_response(e)
@@ -25,6 +25,7 @@ from api.db.db_models import TenantLLM
from api.utils.api_utils import get_json_result
from rag.llm import EmbeddingModel, ChatModel, RerankModel, CvModel, TTSModel
import requests
from api.utils.log_utils import logger


@manager.route('/factories', methods=['GET'])
@@ -89,7 +90,7 @@ def set_api_key():
if len(arr) == 0 or tc == 0:
raise Exception("Fail")
rerank_passed = True
print(f'passed model rerank{llm.llm_name}',flush=True)
logger.info(f'passed model rerank {llm.llm_name}')
except Exception as e:
msg += f"\nFail to access model({llm.llm_name}) using this api key." + str(
e)
@@ -526,4 +526,4 @@ def list(tenant_id):
new_key = key_mapping.get(key, key)
renamed_data[new_key] = value
renamed_list.append(renamed_data)
return get_result(data=renamed_list)
return get_result(data=renamed_list)
@@ -53,8 +53,8 @@ from api.settings import (
)
from api.db.services.user_service import UserService, TenantService, UserTenantService
from api.db.services.file_service import FileService
from api.settings import stat_logger
from api.utils.api_utils import get_json_result, construct_response
from api.utils.log_utils import logger


@manager.route("/login", methods=["POST", "GET"])
@@ -177,7 +177,7 @@ def github_callback():
try:
avatar = download_img(user_info["avatar_url"])
except Exception as e:
stat_logger.exception(e)
logger.exception(e)
avatar = ""
users = user_register(
user_id,
@@ -202,7 +202,7 @@ def github_callback():
return redirect("/?auth=%s" % user.get_id())
except Exception as e:
rollback_user_registration(user_id)
stat_logger.exception(e)
logger.exception(e)
return redirect("/?error=%s" % str(e))

# User has already registered, try to log in
@@ -279,7 +279,7 @@ def feishu_callback():
try:
avatar = download_img(user_info["avatar_url"])
except Exception as e:
stat_logger.exception(e)
logger.exception(e)
avatar = ""
users = user_register(
user_id,
@@ -304,7 +304,7 @@ def feishu_callback():
return redirect("/?auth=%s" % user.get_id())
except Exception as e:
rollback_user_registration(user_id)
stat_logger.exception(e)
logger.exception(e)
return redirect("/?error=%s" % str(e))

# User has already registered, try to log in
@@ -436,7 +436,7 @@ def setting_user():
UserService.update_by_id(current_user.id, update_dict)
return get_json_result(data=True)
except Exception as e:
stat_logger.exception(e)
logger.exception(e)
return get_json_result(
data=False, message="Update failure!", code=RetCode.EXCEPTION_ERROR
)
@@ -621,7 +621,7 @@ def user_add():
)
except Exception as e:
rollback_user_registration(user_id)
stat_logger.exception(e)
logger.exception(e)
return get_json_result(
data=False,
message=f"User registration failure, error: {str(e)}",
@@ -30,12 +30,9 @@ from peewee import (
)
from playhouse.pool import PooledMySQLDatabase, PooledPostgresqlDatabase
from api.db import SerializedType, ParserType
from api.settings import DATABASE, stat_logger, SECRET_KEY, DATABASE_TYPE
from api.utils.log_utils import getLogger
from api.settings import DATABASE, SECRET_KEY, DATABASE_TYPE
from api import utils

LOGGER = getLogger()

from api.utils.log_utils import logger

def singleton(cls, *args, **kw):
instances = {}
@@ -288,7 +285,7 @@ class BaseDataBase:
database_config = DATABASE.copy()
db_name = database_config.pop("name")
self.database_connection = PooledDatabase[DATABASE_TYPE.upper()].value(db_name, **database_config)
stat_logger.info('init database on cluster mode successfully')
logger.info('init database on cluster mode successfully')

class PostgresDatabaseLock:
def __init__(self, lock_name, timeout=10, db=None):
@@ -396,7 +393,7 @@ def close_connection():
if DB:
DB.close_stale(age=30)
except Exception as e:
LOGGER.exception(e)
logger.exception(e)


class DataBaseModel(BaseModel):
@@ -412,15 +409,15 @@ def init_database_tables(alter_fields=[]):
for name, obj in members:
if obj != DataBaseModel and issubclass(obj, DataBaseModel):
table_objs.append(obj)
LOGGER.info(f"start create table {obj.__name__}")
logger.info(f"start create table {obj.__name__}")
try:
obj.create_table()
LOGGER.info(f"create table success: {obj.__name__}")
logger.info(f"create table success: {obj.__name__}")
except Exception as e:
LOGGER.exception(e)
logger.exception(e)
create_failed_list.append(obj.__name__)
if create_failed_list:
LOGGER.info(f"create tables failed: {create_failed_list}")
logger.info(f"create tables failed: {create_failed_list}")
raise Exception(f"create tables failed: {create_failed_list}")
migrate_db()
@@ -22,12 +22,6 @@ from playhouse.pool import PooledMySQLDatabase
from api.utils import current_timestamp, timestamp_to_date

from api.db.db_models import DB, DataBaseModel
from api.db.runtime_config import RuntimeConfig
from api.utils.log_utils import getLogger
from enum import Enum


LOGGER = getLogger()


@DB.connection_context()
@@ -30,6 +30,7 @@ from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantL
from api.db.services.user_service import TenantService, UserTenantService
from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger


def encode_to_base64(input_string):
@@ -69,36 +70,34 @@ def init_superuser():
"api_key": API_KEY, "api_base": LLM_BASE_URL})

if not UserService.save(**user_info):
print("\033[93m【ERROR】\033[0mcan't init admin.")
logger.info("can't init admin.")
return
TenantService.insert(**tenant)
UserTenantService.insert(**usr_tenant)
TenantLLMService.insert_many(tenant_llm)
print(
"【INFO】Super user initialized. \033[93memail: admin@ragflow.io, password: admin\033[0m. Changing the password after logining is strongly recomanded.")
logger.info(
"Super user initialized. email: admin@ragflow.io, password: admin. Changing the password after logining is strongly recomanded.")

chat_mdl = LLMBundle(tenant["id"], LLMType.CHAT, tenant["llm_id"])
msg = chat_mdl.chat(system="", history=[
{"role": "user", "content": "Hello!"}], gen_conf={})
if msg.find("ERROR: ") == 0:
print(
"\33[91m【ERROR】\33[0m: ",
logger.error(
"'{}' dosen't work. {}".format(
tenant["llm_id"],
msg))
embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"])
v, c = embd_mdl.encode(["Hello!"])
if c == 0:
print(
"\33[91m【ERROR】\33[0m:",
" '{}' dosen't work!".format(
logger.error(
"'{}' dosen't work!".format(
tenant["embd_id"]))


def init_llm_factory():
try:
LLMService.filter_delete([(LLM.fid == "MiniMax" or LLM.fid == "Minimax")])
except Exception as e:
except Exception:
pass

factory_llm_infos = json.load(
@@ -111,14 +110,14 @@ def init_llm_factory():
llm_infos = factory_llm_info.pop("llm")
try:
LLMFactoriesService.save(**factory_llm_info)
except Exception as e:
except Exception:
pass
LLMService.filter_delete([LLM.fid == factory_llm_info["name"]])
for llm_info in llm_infos:
llm_info["fid"] = factory_llm_info["name"]
try:
LLMService.save(**llm_info)
except Exception as e:
except Exception:
pass

LLMFactoriesService.filter_delete([LLMFactories.name == "Local"])
@@ -145,7 +144,7 @@ def init_llm_factory():
row = deepcopy(row)
row["llm_name"] = "text-embedding-3-large"
TenantLLMService.save(**row)
except Exception as e:
except Exception:
pass
break
for kb_id in KnowledgebaseService.get_all_ids():
@@ -169,9 +168,8 @@ def add_graph_templates():
CanvasTemplateService.save(**cnvs)
except:
CanvasTemplateService.update_by_id(cnvs["id"], cnvs)
except Exception as e:
print("Add graph templates error: ", e)
print("------------", flush=True)
except Exception:
logger.exception("Add graph templates error: ")


def init_web_data():
@@ -182,7 +180,7 @@ def init_web_data():
# init_superuser()

add_graph_templates()
print("init web data success:{}".format(time.time() - start_time))
logger.info("init web data success:{}".format(time.time() - start_time))


if __name__ == '__main__':
@@ -1,21 +0,0 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import operator
import time
import typing
from api.utils.log_utils import sql_logger
import peewee
@@ -26,11 +26,12 @@ from api.db.db_models import Dialog, Conversation,DB
from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
from api.settings import chat_logger, retrievaler, kg_retrievaler
from api.settings import retrievaler, kg_retrievaler
from rag.app.resume import forbidden_select_fields4resume
from rag.nlp.search import index_name
from rag.utils import rmSpace, num_tokens_from_string, encoder
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger


class DialogService(CommonService):
@@ -177,7 +178,7 @@ def chat(dialog, messages, stream=True, **kwargs):
tts_mdl = LLMBundle(dialog.tenant_id, LLMType.TTS)
# try to use sql if field mapping is good to go
if field_map:
chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
logger.info("Use SQL to retrieval:{}".format(questions[-1]))
ans = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl, prompt_config.get("quote", True))
if ans:
yield ans
@@ -219,7 +220,7 @@ def chat(dialog, messages, stream=True, **kwargs):
doc_ids=attachments,
top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
chat_logger.info(
logger.info(
"{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
retrieval_tm = timer()

@@ -291,7 +292,7 @@ def chat(dialog, messages, stream=True, **kwargs):
yield decorate_answer(answer)
else:
answer = chat_mdl.chat(prompt, msg[1:], gen_conf)
chat_logger.info("User: {}|Assistant: {}".format(
logger.info("User: {}|Assistant: {}".format(
msg[-1]["content"], answer))
res = decorate_answer(answer)
res["audio_binary"] = tts(tts_mdl, answer)
@@ -319,8 +320,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
nonlocal sys_prompt, user_promt, question, tried_times
sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], {
"temperature": 0.06})
print(user_promt, sql)
chat_logger.info(f"“{question}”==>{user_promt} get SQL: {sql}")
logger.info(f"{question} ==> {user_promt} get SQL: {sql}")
sql = re.sub(r"[\r\n]+", " ", sql.lower())
sql = re.sub(r".*select ", "select ", sql.lower())
sql = re.sub(r" +", " ", sql)
@@ -340,9 +340,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
flds.append(k)
sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:]

print(f"“{question}” get SQL(refined): {sql}")

chat_logger.info(f"“{question}” get SQL(refined): {sql}")
logger.info(f"{question} get SQL(refined): {sql}")
tried_times += 1
return retrievaler.sql_retrieval(sql, format="json"), sql

@@ -371,10 +369,9 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
question, sql, tbl["error"]
)
tbl, sql = get_table()
chat_logger.info("TRY it again: {}".format(sql))
logger.info("TRY it again: {}".format(sql))

chat_logger.info("GET table: {}".format(tbl))
print(tbl)
logger.info("GET table: {}".format(tbl))
if tbl.get("error") or len(tbl["rows"]) == 0:
return None

@@ -404,7 +401,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)

if not docid_idx or not docnm_idx:
chat_logger.warning("SQL missing field: " + sql)
logger.warning("SQL missing field: " + sql)
return {
"answer": "\n".join([clmns, line, rows]),
"reference": {"chunks": [], "doc_aggs": []},
@@ -17,7 +17,6 @@ import hashlib
import json
import random
import re
import traceback
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from datetime import datetime
@@ -26,7 +25,7 @@ from io import BytesIO
from peewee import fn

from api.db.db_utils import bulk_insert_into_db
from api.settings import stat_logger, docStoreConn
from api.settings import docStoreConn
from api.utils import current_timestamp, get_format_time, get_uuid
from graphrag.mind_map_extractor import MindMapExtractor
from rag.settings import SVR_QUEUE_NAME
@@ -40,6 +39,7 @@ from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db import StatusEnum
from rag.utils.redis_conn import REDIS_CONN
from api.utils.log_utils import logger


class DocumentService(CommonService):
@@ -387,7 +387,7 @@ class DocumentService(CommonService):
cls.update_by_id(d["id"], info)
except Exception as e:
if str(e).find("'0'") < 0:
stat_logger.error("fetch task exception:" + str(e))
logger.exception("fetch task exception")

@classmethod
@DB.connection_context()
@@ -544,7 +544,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
"knowledge_graph_kwd": "mind_map"
})
except Exception as e:
stat_logger.error("Mind map generation error:", traceback.format_exc())
logger.exception("Mind map generation error")

vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
assert len(cks) == len(vects)
@@ -28,6 +28,7 @@ from api.db.services.file2document_service import File2DocumentService
from api.utils import get_uuid
from api.utils.file_utils import filename_type, thumbnail_img
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.log_utils import logger


class FileService(CommonService):
@@ -272,8 +273,8 @@ class FileService(CommonService):
cls.delete_folder_by_pf_id(user_id, file.id)
return cls.model.delete().where((cls.model.tenant_id == user_id)
& (cls.model.id == folder_id)).execute(),
except Exception as e:
print(e)
except Exception:
logger.exception("delete_folder_by_pf_id")
raise RuntimeError("Database error (File retrieval)!")

@classmethod
@@ -321,8 +322,8 @@ class FileService(CommonService):
def move_file(cls, file_ids, folder_id):
try:
cls.filter_update((cls.model.id << file_ids, ), { 'parent_id': folder_id })
except Exception as e:
print(e)
except Exception:
logger.exception("move_file")
raise RuntimeError("Database error (File move)!")

@classmethod
@@ -14,12 +14,12 @@
# limitations under the License.
#
from api.db.services.user_service import TenantService
from api.settings import database_logger
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel, TTSModel
from api.db import LLMType
from api.db.db_models import DB
from api.db.db_models import LLMFactories, LLM, TenantLLM
from api.db.services.common_service import CommonService
from api.utils.log_utils import logger


class LLMFactoriesService(CommonService):
@@ -209,40 +209,40 @@ class LLMBundle(object):
emd, used_tokens = self.mdl.encode(texts, batch_size)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
database_logger.error(
"Can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
logger.error(
"LLMBundle.encode can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
return emd, used_tokens

def encode_queries(self, query: str):
emd, used_tokens = self.mdl.encode_queries(query)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
database_logger.error(
"Can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
logger.error(
"LLMBundle.encode_queries can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
return emd, used_tokens

def similarity(self, query: str, texts: list):
sim, used_tokens = self.mdl.similarity(query, texts)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
database_logger.error(
"Can't update token usage for {}/RERANK used_tokens: {}".format(self.tenant_id, used_tokens))
logger.error(
"LLMBundle.similarity can't update token usage for {}/RERANK used_tokens: {}".format(self.tenant_id, used_tokens))
return sim, used_tokens

def describe(self, image, max_tokens=300):
txt, used_tokens = self.mdl.describe(image, max_tokens)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
database_logger.error(
"Can't update token usage for {}/IMAGE2TEXT used_tokens: {}".format(self.tenant_id, used_tokens))
logger.error(
"LLMBundle.describe can't update token usage for {}/IMAGE2TEXT used_tokens: {}".format(self.tenant_id, used_tokens))
return txt

def transcription(self, audio):
txt, used_tokens = self.mdl.transcription(audio)
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens):
database_logger.error(
"Can't update token usage for {}/SEQUENCE2TXT used_tokens: {}".format(self.tenant_id, used_tokens))
logger.error(
"LLMBundle.transcription can't update token usage for {}/SEQUENCE2TXT used_tokens: {}".format(self.tenant_id, used_tokens))
return txt

def tts(self, text):
@@ -250,8 +250,8 @@ class LLMBundle(object):
if isinstance(chunk,int):
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, chunk, self.llm_name):
database_logger.error(
"Can't update token usage for {}/TTS".format(self.tenant_id))
logger.error(
"LLMBundle.tts can't update token usage for {}/TTS".format(self.tenant_id))
return
yield chunk

@@ -259,8 +259,8 @@ class LLMBundle(object):
txt, used_tokens = self.mdl.chat(system, history, gen_conf)
if isinstance(txt, int) and not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, used_tokens, self.llm_name):
database_logger.error(
"Can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.llm_name, used_tokens))
logger.error(
"LLMBundle.chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.llm_name, used_tokens))
return txt

def chat_streamly(self, system, history, gen_conf):
@@ -268,7 +268,7 @@ class LLMBundle(object):
if isinstance(txt, int):
if not TenantLLMService.increase_usage(
self.tenant_id, self.llm_type, txt, self.llm_name):
database_logger.error(
"Can't update token usage for {}/CHAT llm_name: {}, content: {}".format(self.tenant_id, self.llm_name, txt))
logger.error(
"LLMBundle.chat_streamly can't update token usage for {}/CHAT llm_name: {}, content: {}".format(self.tenant_id, self.llm_name, txt))
return
yield txt
@@ -27,13 +27,10 @@ from api.apps import app
from api.db.runtime_config import RuntimeConfig
from api.db.services.document_service import DocumentService
from api.settings import (
HOST,
HTTP_PORT,
access_logger,
database_logger,
stat_logger,
HOST, HTTP_PORT
)
from api import utils
from api.utils.log_utils import logger

from api.db.db_models import init_database_tables as init_web_db
from api.db.init_data import init_web_data
@@ -45,23 +42,22 @@ def update_progress():
time.sleep(3)
try:
DocumentService.update_progress()
except Exception as e:
stat_logger.error("update_progress exception:" + str(e))
except Exception:
logger.exception("update_progress exception")


if __name__ == "__main__":
print(
r"""
if __name__ == '__main__':
logger.info(r"""
____ ___ ______ ______ __
/ __ \ / | / ____// ____// /____ _ __
/ /_/ // /| | / / __ / /_ / // __ \| | /| / /
/ _, _// ___ |/ /_/ // __/ / // /_/ /| |/ |/ /
/_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/

""",
flush=True,
""")
logger.info(
f'project base: {utils.file_utils.get_project_base_directory()}'
)
stat_logger.info(f"project base: {utils.file_utils.get_project_base_directory()}")

# init db
init_web_db()
@@ -83,7 +79,7 @@ if __name__ == "__main__":

RuntimeConfig.DEBUG = args.debug
if RuntimeConfig.DEBUG:
stat_logger.info("run on debug mode")
logger.info("run on debug mode")

RuntimeConfig.init_env()
RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT)
@@ -91,17 +87,17 @@ if __name__ == "__main__":
peewee_logger = logging.getLogger("peewee")
peewee_logger.propagate = False
# rag_arch.common.log.ROpenHandler
peewee_logger.addHandler(database_logger.handlers[0])
peewee_logger.setLevel(database_logger.level)
peewee_logger.addHandler(logger.handlers[0])
peewee_logger.setLevel(logger.handlers[0].level)

thr = ThreadPoolExecutor(max_workers=1)
thr.submit(update_progress)

# start http server
try:
stat_logger.info("RAG Flow http server start...")
logger.info("RAG Flow http server start...")
werkzeug_logger = logging.getLogger("werkzeug")
for h in access_logger.handlers:
for h in logger.handlers:
werkzeug_logger.addHandler(h)
run_simple(
hostname=HOST,
@@ -17,24 +17,9 @@ import os
from datetime import date
from enum import IntEnum, Enum
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import LoggerFactory, getLogger
import rag.utils.es_conn
import rag.utils.infinity_conn

# Logger
LoggerFactory.set_directory(
os.path.join(
get_project_base_directory(),
"logs",
"api"))
# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
LoggerFactory.LEVEL = 30

stat_logger = getLogger("stat")
access_logger = getLogger("access")
database_logger = getLogger("database")
chat_logger = getLogger("chat")

import rag.utils
from rag.nlp import search
from graphrag import search as kg_search
@@ -47,8 +32,6 @@ TEMP_DIRECTORY = os.path.join(get_project_base_directory(), "temp")
RAG_FLOW_CONF_PATH = os.path.join(get_project_base_directory(), "conf")
LIGHTEN = int(os.environ.get('LIGHTEN', "0"))

SUBPROCESS_STD_LOG_NAME = "std.log"

ERROR_REPORT = True
ERROR_REPORT_WITH_PATH = False
@@ -35,11 +35,12 @@ from werkzeug.http import HTTP_STATUS_CODES
from api.db.db_models import APIToken
from api.settings import (
REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC,
stat_logger, CLIENT_AUTHENTICATION, HTTP_APP_KEY, SECRET_KEY
CLIENT_AUTHENTICATION, HTTP_APP_KEY, SECRET_KEY
)
from api.settings import RetCode
from api.utils import CustomJSONEncoder, get_uuid
from api.utils import json_dumps
from api.utils.log_utils import logger

requests.models.complexjson.dumps = functools.partial(
json.dumps, cls=CustomJSONEncoder)
@@ -117,7 +118,7 @@ def get_data_error_result(code=RetCode.DATA_ERROR,


def server_error_response(e):
stat_logger.exception(e)
logger.exception(e)
try:
if e.code == 401:
return get_json_result(code=401, message=repr(e))
@@ -258,7 +259,7 @@ def construct_json_result(code=RetCode.SUCCESS, message='success', data=None):


def construct_error_response(e):
stat_logger.exception(e)
logger.exception(e)
try:
if e.code == 401:
return construct_json_result(code=RetCode.UNAUTHORIZED, message=repr(e))
@ -14,300 +14,38 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import os
|
||||
import typing
|
||||
import traceback
|
||||
import logging
|
||||
import inspect
|
||||
from logging.handlers import TimedRotatingFileHandler
|
||||
from threading import RLock
|
||||
from logging.handlers import RotatingFileHandler
|
||||
|
||||
from api.utils import file_utils
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
|
||||
LOG_LEVEL = logging.INFO
|
||||
LOG_FILE = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"ragflow_{os.getpid()}.log"))
|
||||
LOG_FORMAT = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"
|
||||
logger = None
|
||||
|
||||
class LoggerFactory(object):
|
||||
TYPE = "FILE"
|
||||
LOG_FORMAT = "[%(levelname)s] [%(asctime)s] [%(module)s.%(funcName)s] [line:%(lineno)d]: %(message)s"
|
||||
logging.basicConfig(format=LOG_FORMAT)
|
||||
LEVEL = logging.DEBUG
|
||||
logger_dict = {}
|
||||
global_handler_dict = {}
|
||||
|
||||
LOG_DIR = None
|
||||
PARENT_LOG_DIR = None
|
||||
log_share = True
|
||||
|
||||
append_to_parent_log = None
|
||||
|
||||
lock = RLock()
|
||||
# CRITICAL = 50
|
||||
# FATAL = CRITICAL
|
||||
# ERROR = 40
|
||||
# WARNING = 30
|
||||
# WARN = WARNING
|
||||
# INFO = 20
|
||||
# DEBUG = 10
|
||||
# NOTSET = 0
|
||||
levels = (10, 20, 30, 40)
|
||||
schedule_logger_dict = {}
|
||||
|
||||
@staticmethod
|
||||
def set_directory(directory=None, parent_log_dir=None,
|
||||
append_to_parent_log=None, force=False):
|
||||
if parent_log_dir:
|
||||
LoggerFactory.PARENT_LOG_DIR = parent_log_dir
|
||||
if append_to_parent_log:
|
||||
LoggerFactory.append_to_parent_log = append_to_parent_log
|
||||
with LoggerFactory.lock:
|
||||
if not directory:
|
||||
directory = file_utils.get_project_base_directory("logs")
|
||||
if not LoggerFactory.LOG_DIR or force:
|
||||
LoggerFactory.LOG_DIR = directory
|
||||
if LoggerFactory.log_share:
|
||||
oldmask = os.umask(000)
|
||||
os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True)
|
||||
os.umask(oldmask)
|
||||
else:
|
||||
os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True)
|
||||
for loggerName, ghandler in LoggerFactory.global_handler_dict.items():
|
||||
for className, (logger,
|
||||
handler) in LoggerFactory.logger_dict.items():
|
||||
logger.removeHandler(ghandler)
|
||||
ghandler.close()
|
||||
LoggerFactory.global_handler_dict = {}
|
||||
for className, (logger,
|
||||
handler) in LoggerFactory.logger_dict.items():
|
||||
logger.removeHandler(handler)
|
||||
_handler = None
|
||||
if handler:
|
||||
handler.close()
|
||||
if className != "default":
|
||||
_handler = LoggerFactory.get_handler(className)
|
||||
logger.addHandler(_handler)
|
||||
LoggerFactory.assemble_global_handler(logger)
|
||||
LoggerFactory.logger_dict[className] = logger, _handler
|
||||
|
||||
@staticmethod
|
||||
def new_logger(name):
|
||||
logger = logging.getLogger(name)
|
||||
logger.propagate = False
|
||||
logger.setLevel(LoggerFactory.LEVEL)
|
||||
def getLogger():
|
||||
global logger
|
||||
if logger is not None:
|
||||
return logger
|
||||
|
||||
@staticmethod
|
||||
def get_logger(class_name=None):
|
||||
with LoggerFactory.lock:
|
||||
if class_name in LoggerFactory.logger_dict.keys():
|
||||
logger, handler = LoggerFactory.logger_dict[class_name]
|
||||
if not logger:
|
||||
logger, handler = LoggerFactory.init_logger(class_name)
|
||||
else:
|
||||
logger, handler = LoggerFactory.init_logger(class_name)
|
||||
return logger
|
||||
print(f"log file path: {LOG_FILE}")
|
||||
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
|
||||
logger = logging.getLogger("ragflow")
|
||||
logger.setLevel(LOG_LEVEL)
|
||||
|
||||
@staticmethod
|
||||
def get_global_handler(logger_name, level=None, log_dir=None):
|
||||
if not LoggerFactory.LOG_DIR:
|
||||
return logging.StreamHandler()
|
||||
if log_dir:
|
||||
logger_name_key = logger_name + "_" + log_dir
|
||||
else:
|
||||
logger_name_key = logger_name + "_" + LoggerFactory.LOG_DIR
|
||||
# if loggerName not in LoggerFactory.globalHandlerDict:
|
||||
if logger_name_key not in LoggerFactory.global_handler_dict:
|
||||
with LoggerFactory.lock:
|
||||
if logger_name_key not in LoggerFactory.global_handler_dict:
|
||||
handler = LoggerFactory.get_handler(
|
||||
logger_name, level, log_dir)
|
||||
LoggerFactory.global_handler_dict[logger_name_key] = handler
|
||||
return LoggerFactory.global_handler_dict[logger_name_key]
|
||||
handler1 = RotatingFileHandler(LOG_FILE, maxBytes=10*1024*1024, backupCount=5)
|
||||
handler1.setLevel(LOG_LEVEL)
|
||||
formatter1 = logging.Formatter(LOG_FORMAT)
|
||||
handler1.setFormatter(formatter1)
|
||||
logger.addHandler(handler1)
|
||||
|
||||
@staticmethod
|
||||
def get_handler(class_name, level=None, log_dir=None,
|
||||
log_type=None, job_id=None):
|
||||
if not log_type:
|
||||
if not LoggerFactory.LOG_DIR or not class_name:
|
||||
return logging.StreamHandler()
|
||||
# return Diy_StreamHandler()
|
||||
handler2 = logging.StreamHandler()
|
||||
handler2.setLevel(LOG_LEVEL)
|
||||
formatter2 = logging.Formatter(LOG_FORMAT)
|
||||
handler2.setFormatter(formatter2)
|
||||
logger.addHandler(handler2)
|
||||
|
||||
if not log_dir:
|
||||
log_file = os.path.join(
|
||||
LoggerFactory.LOG_DIR,
|
||||
"{}.log".format(class_name))
|
||||
else:
|
||||
log_file = os.path.join(log_dir, "{}.log".format(class_name))
|
||||
else:
|
||||
log_file = os.path.join(log_dir, "rag_flow_{}.log".format(
|
||||
log_type) if level == LoggerFactory.LEVEL else 'rag_flow_{}_error.log'.format(log_type))
|
||||
|
||||
os.makedirs(os.path.dirname(log_file), exist_ok=True)
|
||||
if LoggerFactory.log_share:
|
||||
handler = ROpenHandler(log_file,
|
||||
when='D',
|
||||
interval=1,
|
||||
backupCount=14,
|
||||
delay=True)
|
||||
else:
|
||||
handler = TimedRotatingFileHandler(log_file,
|
||||
when='D',
|
||||
interval=1,
|
||||
backupCount=14,
|
||||
delay=True)
|
||||
if level:
|
||||
handler.level = level
|
||||
|
||||
return handler
|
||||
|
||||
@staticmethod
|
||||
def init_logger(class_name):
|
||||
with LoggerFactory.lock:
|
||||
logger = LoggerFactory.new_logger(class_name)
|
||||
handler = None
|
||||
if class_name:
|
||||
handler = LoggerFactory.get_handler(class_name)
|
||||
logger.addHandler(handler)
|
||||
LoggerFactory.logger_dict[class_name] = logger, handler
|
||||
|
||||
else:
|
||||
LoggerFactory.logger_dict["default"] = logger, handler
|
||||
|
||||
LoggerFactory.assemble_global_handler(logger)
|
||||
return logger, handler
|
||||
|
||||
@staticmethod
|
||||
def assemble_global_handler(logger):
|
||||
if LoggerFactory.LOG_DIR:
|
||||
for level in LoggerFactory.levels:
|
||||
if level >= LoggerFactory.LEVEL:
|
||||
level_logger_name = logging._levelToName[level]
|
||||
logger.addHandler(
|
||||
LoggerFactory.get_global_handler(
|
||||
level_logger_name, level))
|
||||
if LoggerFactory.append_to_parent_log and LoggerFactory.PARENT_LOG_DIR:
|
||||
for level in LoggerFactory.levels:
|
||||
if level >= LoggerFactory.LEVEL:
|
||||
level_logger_name = logging._levelToName[level]
|
||||
logger.addHandler(
|
||||
LoggerFactory.get_global_handler(level_logger_name, level, LoggerFactory.PARENT_LOG_DIR))
|
||||
|
||||
|
||||
def setDirectory(directory=None):
|
||||
LoggerFactory.set_directory(directory)
|
||||
|
||||
|
||||
def setLevel(level):
|
||||
LoggerFactory.LEVEL = level
|
||||
|
||||
|
||||
def getLogger(className=None, useLevelFile=False):
|
||||
if className is None:
frame = inspect.stack()[1]
module = inspect.getmodule(frame[0])
className = 'stat'
return LoggerFactory.get_logger(className)


def exception_to_trace_string(ex):
return "".join(traceback.TracebackException.from_exception(ex).format())


class ROpenHandler(TimedRotatingFileHandler):
def _open(self):
prevumask = os.umask(000)
rtv = TimedRotatingFileHandler._open(self)
os.umask(prevumask)
return rtv


def sql_logger(job_id='', log_type='sql'):
key = job_id + log_type
if key in LoggerFactory.schedule_logger_dict.keys():
return LoggerFactory.schedule_logger_dict[key]
return get_job_logger(job_id=job_id, log_type=log_type)


def ready_log(msg, job=None, task=None, role=None, party_id=None, detail=None):
prefix, suffix = base_msg(job, task, role, party_id, detail)
return f"{prefix}{msg} ready{suffix}"


def start_log(msg, job=None, task=None, role=None, party_id=None, detail=None):
prefix, suffix = base_msg(job, task, role, party_id, detail)
return f"{prefix}start to {msg}{suffix}"


def successful_log(msg, job=None, task=None, role=None,
party_id=None, detail=None):
prefix, suffix = base_msg(job, task, role, party_id, detail)
return f"{prefix}{msg} successfully{suffix}"


def warning_log(msg, job=None, task=None, role=None,
party_id=None, detail=None):
prefix, suffix = base_msg(job, task, role, party_id, detail)
return f"{prefix}{msg} is not effective{suffix}"


def failed_log(msg, job=None, task=None, role=None,
party_id=None, detail=None):
prefix, suffix = base_msg(job, task, role, party_id, detail)
return f"{prefix}failed to {msg}{suffix}"


def base_msg(job=None, task=None, role: str = None,
party_id: typing.Union[str, int] = None, detail=None):
if detail:
detail_msg = f" detail: \n{detail}"
else:
detail_msg = ""
if task is not None:
return f"task {task.f_task_id} {task.f_task_version} ", f" on {task.f_role} {task.f_party_id}{detail_msg}"
elif job is not None:
return "", f" on {job.f_role} {job.f_party_id}{detail_msg}"
elif role and party_id:
return "", f" on {role} {party_id}{detail_msg}"
else:
return "", f"{detail_msg}"


def exception_to_trace_string(ex):
return "".join(traceback.TracebackException.from_exception(ex).format())


def get_logger_base_dir():
job_log_dir = file_utils.get_rag_flow_directory('logs')
return job_log_dir


def get_job_logger(job_id, log_type):
rag_flow_log_dir = file_utils.get_rag_flow_directory('logs', 'rag_flow')
job_log_dir = file_utils.get_rag_flow_directory('logs', job_id)
if not job_id:
log_dirs = [rag_flow_log_dir]
else:
if log_type == 'audit':
log_dirs = [job_log_dir, rag_flow_log_dir]
else:
log_dirs = [job_log_dir]
if LoggerFactory.log_share:
oldmask = os.umask(000)
os.makedirs(job_log_dir, exist_ok=True)
os.makedirs(rag_flow_log_dir, exist_ok=True)
os.umask(oldmask)
else:
os.makedirs(job_log_dir, exist_ok=True)
os.makedirs(rag_flow_log_dir, exist_ok=True)
logger = LoggerFactory.new_logger(f"{job_id}_{log_type}")
for job_log_dir in log_dirs:
handler = LoggerFactory.get_handler(class_name=None, level=LoggerFactory.LEVEL,
log_dir=job_log_dir, log_type=log_type, job_id=job_id)
error_handler = LoggerFactory.get_handler(
class_name=None,
level=logging.ERROR,
log_dir=job_log_dir,
log_type=log_type,
job_id=job_id)
logger.addHandler(handler)
logger.addHandler(error_handler)
with LoggerFactory.lock:
LoggerFactory.schedule_logger_dict[job_id + log_type] = logger
return logger

logger = getLogger()
|
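The hunks that follow replace the per-job LoggerFactory machinery above, the module-level logging calls, and the ad-hoc print/DEBUG tracing with the single shared logger imported from api.utils.log_utils. A minimal sketch of the resulting call pattern, assuming only that import (do_work and its body are illustrative placeholders, not code from the repository):

from api.utils.log_utils import logger

def do_work(task_id: str):
    # replaces the old `if DEBUG: print(...)` style of tracing
    logger.debug(f"do_work: {task_id}")
    try:
        result = 1 / int(task_id)  # stand-in for the real work
        logger.info(f"task {task_id} finished: {result}")
    except Exception:
        # records the message together with the active traceback
        logger.exception(f"do_work({task_id}) got exception")
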
@ -19,13 +19,14 @@ from io import BytesIO
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
from PIL import Image
import numpy as np
from timeit import default_timer as timer
from pypdf import PdfReader as pdf2_read

from api.settings import LIGHTEN
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
from copy import deepcopy
@ -49,15 +50,15 @@ class RAGFlowPdfParser:
import torch
if torch.cuda.is_available():
self.updown_cnt_mdl.set_param({"device": "cuda"})
except Exception as e:
logging.error(str(e))
except Exception:
logger.exception("RAGFlowPdfParser __init__")
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
self.updown_cnt_mdl.load_model(os.path.join(
model_dir, "updown_concat_xgb.model"))
except Exception as e:
except Exception:
model_dir = snapshot_download(
repo_id="InfiniFlow/text_concat_xgb_v1.0",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
@ -187,7 +188,7 @@ class RAGFlowPdfParser:
return True

def _table_transformer_job(self, ZM):
logging.info("Table processing...")
logger.info("Table processing...")
imgs, pos = [], []
tbcnt = [0]
MARGIN = 10
@ -425,12 +426,12 @@ class RAGFlowPdfParser:
detach_feats = [b["x1"] < b_["x0"],
b["x0"] > b_["x1"]]
if (any(feats) and not any(concatting_feats)) or any(detach_feats):
print(
logger.info("{} {} {} {}".format(
b["text"],
b_["text"],
any(feats),
any(concatting_feats),
any(detach_feats))
))
i += 1
continue
# merge up and down
@ -726,14 +727,14 @@ class RAGFlowPdfParser:
# continue
if tv < fv and tk:
tables[tk].insert(0, c)
logging.debug(
logger.debug(
"TABLE:" +
self.boxes[i]["text"] +
"; Cap: " +
tk)
elif fk:
figures[fk].insert(0, c)
logging.debug(
logger.debug(
"FIGURE:" +
self.boxes[i]["text"] +
"; Cap: " +
@ -760,7 +761,7 @@ class RAGFlowPdfParser:
if ii is not None:
b = louts[ii]
else:
logging.warn(
logger.warn(
f"Missing layout match: {pn + 1},%s" %
(bxs[0].get(
"layoutno", "")))
@ -918,8 +919,8 @@ class RAGFlowPdfParser:
if usefull(boxes[0]):
dfs(boxes[0], 0)
else:
logging.debug("WASTE: " + boxes[0]["text"])
except Exception as e:
logger.debug("WASTE: " + boxes[0]["text"])
except Exception:
pass
boxes.pop(0)
mw = np.mean(widths)
@ -927,7 +928,7 @@ class RAGFlowPdfParser:
res.append(
"\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
else:
logging.debug("REMOVED: " +
logger.debug("REMOVED: " +
"<<".join([c["text"] for c in lines]))

return "\n\n".join(res)
@ -938,8 +939,8 @@ class RAGFlowPdfParser:
pdf = pdfplumber.open(
fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages)
except Exception as e:
logging.error(str(e))
except Exception:
logger.exception("total_page_number")

def __images__(self, fnm, zoomin=3, page_from=0,
page_to=299, callback=None):
@ -962,8 +963,8 @@ class RAGFlowPdfParser:
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages)
except Exception as e:
logging.error(str(e))
except Exception:
logger.exception("RAGFlowPdfParser __images__")

self.outlines = []
try:
@ -979,11 +980,11 @@ class RAGFlowPdfParser:

dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
logger.warning(f"Outlines exception: {e}")
if not self.outlines:
logging.warning(f"Miss outlines")
logger.warning("Miss outlines")

logging.info("Images converted.")
logger.info("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
range(len(self.page_chars))]
@ -1023,7 +1024,7 @@ class RAGFlowPdfParser:
self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
"".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

logging.info("Is it English:", self.is_english)
logger.info("Is it English:", self.is_english)

self.page_cum_height = np.cumsum(self.page_cum_height)
assert len(self.page_cum_height) == len(self.page_images) + 1
@ -1162,10 +1163,10 @@ class PlainParser(object):
dfs(a, depth + 1)

dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
except Exception:
logger.exception("Outlines exception")
if not self.outlines:
logging.warning(f"Miss outlines")
logger.warning("Miss outlines")

return [(l, "") for l in lines], []
|
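A recurring change in the parser hunks above is that `except Exception as e: logging.error(str(e))` becomes a bare `except Exception:` followed by `logger.exception(...)`. A minimal stdlib-only comparison of the two styles (the ValueError is just a stand-in for a parsing failure; the real code logs through the shared logger from api.utils.log_utils):

import logging

logging.basicConfig(level=logging.INFO)

try:
    raise ValueError("corrupt PDF stream")  # stand-in for a pdfplumber/pypdf failure
except Exception as e:
    logging.error(str(e))  # old style: message only, the traceback is lost

try:
    raise ValueError("corrupt PDF stream")
except Exception:
    logging.exception("total_page_number")  # new style: same message plus the full traceback
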
@ -11,10 +11,15 @@
# limitations under the License.
#

import re,json,os
import re
import json
import os
import pandas as pd
from rag.nlp import rag_tokenizer
from . import regions
from api.utils.log_utils import logger


current_file_path = os.path.dirname(os.path.abspath(__file__))
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
@ -27,7 +32,7 @@ def baike(cid, default_v=0):
global GOODS
try:
return GOODS.loc[str(cid), "len"]
except Exception as e:
except Exception:
pass
return default_v

@ -65,7 +70,8 @@ def rmNoise(n):
GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
for c,v in CORP_TAG.items():
cc = corpNorm(rmNoise(c), False)
if not cc: print (c)
if not cc:
logger.info(c)
CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}

def is_good(nm):
|
@ -11,13 +11,19 @@
# limitations under the License.
#

import re, copy, time, datetime, demjson3, \
traceback, signal
import re
import copy
import time
import datetime
import demjson3
import traceback
import signal
import numpy as np
from deepdoc.parser.resume.entities import degrees, schools, corporations
from rag.nlp import rag_tokenizer, surname
from xpinyin import Pinyin
from contextlib import contextmanager
from api.utils.log_utils import logger


class TimeoutException(Exception): pass
@ -79,7 +85,7 @@ def forEdu(cv):
y, m, d = getYMD(dt)
st_dt.append(str(y))
e["start_dt_kwd"] = str(y)
except Exception as e:
except Exception:
pass

r = schools.select(n.get("school_name", ""))
@ -158,7 +164,7 @@ def forEdu(cv):
y, m, d = getYMD(edu_end_dt)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt"))
logger.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
if sch:
cv["school_name_kwd"] = sch
if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
@ -233,7 +239,7 @@ def forWork(cv):
if type(n) == type(""):
try:
n = json_loads(n)
except Exception as e:
except Exception:
continue

if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
@ -269,8 +275,8 @@ def forWork(cv):

try:
duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
except Exception as e:
print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))
except Exception:
logger.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))

if n.get("scale"):
r = re.search(r"^([0-9]+)", str(n["scale"]))
@ -327,7 +333,7 @@ def forWork(cv):
y, m, d = getYMD(work_st_tm)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt"))
logger.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))

cv["job_num_int"] = 0
if duas:
@ -457,8 +463,8 @@ def parse(cv):
t = k[:-4]
cv[f"{t}_kwd"] = nms
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
except Exception as e:
print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
except Exception:
logger.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
cv[k] = []

# tokenize fields
@ -524,7 +530,7 @@ def parse(cv):
if not y: y = "2012"
if not m: m = "01"
if not d: d = "01"
cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
# long text tokenize

if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
@ -556,10 +562,10 @@ def parse(cv):
cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
y, m, d = getYMD(str(cv["work_start_time"]))
cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
except Exception as e:
print("【EXCEPTION】", e, "==>", cv.get("work_start_time"))
logger.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

keys = list(cv.keys())
@ -574,7 +580,7 @@ def parse(cv):

cv["tob_resume_id"] = str(cv["tob_resume_id"])
cv["id"] = cv["tob_resume_id"]
print("CCCCCCCCCCCCCCC")
logger.info("CCCCCCCCCCCCCCC")

return dealWithInt64(cv)

@ -589,4 +595,3 @@ def dealWithInt64(d):

if isinstance(d, np.integer): d = int(d)
return d
|
@ -20,6 +20,7 @@ import cv2
|
||||
import numpy as np
|
||||
import math
|
||||
from PIL import Image
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class DecodeImage(object):
|
||||
@ -402,7 +403,7 @@ class DetResizeForTest(object):
|
||||
return None, (None, None)
|
||||
img = cv2.resize(img, (int(resize_w), int(resize_h)))
|
||||
except BaseException:
|
||||
print(img.shape, resize_w, resize_h)
|
||||
logger.exception("{} {} {}".format(img.shape, resize_w, resize_h))
|
||||
sys.exit(0)
|
||||
ratio_h = resize_h / float(h)
|
||||
ratio_w = resize_w / float(w)
|
||||
@ -452,7 +453,6 @@ class E2EResizeForTest(object):
|
||||
return data
|
||||
|
||||
def resize_image_for_totaltext(self, im, max_side_len=512):
|
||||
|
||||
h, w, _ = im.shape
|
||||
resize_w = w
|
||||
resize_h = h
|
||||
|
@ -19,6 +19,7 @@ from huggingface_hub import snapshot_download
|
||||
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from .operators import *
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Recognizer(object):
|
||||
@ -439,7 +440,7 @@ class Recognizer(object):
|
||||
end_index = min((i + 1) * batch_size, len(imgs))
|
||||
batch_image_list = imgs[start_index:end_index]
|
||||
inputs = self.preprocess(batch_image_list)
|
||||
print("preprocess")
|
||||
logger.info("preprocess")
|
||||
for ins in inputs:
|
||||
bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr)
|
||||
res.append(bb)
|
||||
|
@ -14,6 +14,7 @@
|
||||
import os
|
||||
import PIL
|
||||
from PIL import ImageDraw
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
|
||||
@ -24,7 +25,7 @@ def save_results(image_list, results, labels, output_dir='output/', threshold=0.
|
||||
|
||||
out_path = os.path.join(output_dir, f"{idx}.jpg")
|
||||
im.save(out_path, quality=95)
|
||||
print("save result to: " + out_path)
|
||||
logger.info("save result to: " + out_path)
|
||||
|
||||
|
||||
def draw_box(im, result, lables, threshold=0.5):
|
||||
|
@ -10,7 +10,10 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import os, sys
|
||||
import os
|
||||
import sys
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
sys.path.insert(
|
||||
0,
|
||||
os.path.abspath(
|
||||
@ -56,7 +59,7 @@ def main(args):
|
||||
} for t in lyt]
|
||||
img = draw_box(images[i], lyt, labels, float(args.threshold))
|
||||
img.save(outputs[i], quality=95)
|
||||
print("save result to: " + outputs[i])
|
||||
logger.info("save result to: " + outputs[i])
|
||||
|
||||
|
||||
def get_table_html(img, tb_cpns, ocr):
|
||||
|
@ -7,7 +7,6 @@ Reference:
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
@ -18,12 +17,12 @@ import tiktoken
|
||||
from graphrag.claim_prompt import CLAIM_EXTRACTION_PROMPT, CONTINUE_PROMPT, LOOP_PROMPT
|
||||
from rag.llm.chat_model import Base as CompletionLLM
|
||||
from graphrag.utils import ErrorHandlerFn, perform_variable_replacements
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
DEFAULT_TUPLE_DELIMITER = "<|>"
|
||||
DEFAULT_RECORD_DELIMITER = "##"
|
||||
DEFAULT_COMPLETION_DELIMITER = "<|COMPLETE|>"
|
||||
CLAIM_MAX_GLEANINGS = 1
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -127,7 +126,7 @@ class ClaimExtractor:
|
||||
]
|
||||
source_doc_map[document_id] = text
|
||||
except Exception as e:
|
||||
log.exception("error extracting claim")
|
||||
logger.exception("error extracting claim")
|
||||
self._on_error(
|
||||
e,
|
||||
traceback.format_exc(),
|
||||
@ -266,4 +265,4 @@ if __name__ == "__main__":
|
||||
"claim_description": ""
|
||||
}
|
||||
claim = ex(info)
|
||||
print(json.dumps(claim.output, ensure_ascii=False, indent=2))
|
||||
logger.info(json.dumps(claim.output, ensure_ascii=False, indent=2))
|
||||
|
@ -6,11 +6,10 @@ Reference:
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, List, Callable
|
||||
from typing import List, Callable
|
||||
import networkx as nx
|
||||
import pandas as pd
|
||||
from graphrag import leiden
|
||||
@ -20,8 +19,7 @@ from rag.llm.chat_model import Base as CompletionLLM
|
||||
from graphrag.utils import ErrorHandlerFn, perform_variable_replacements, dict_has_keys_with_types
|
||||
from rag.utils import num_tokens_from_string
|
||||
from timeit import default_timer as timer
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -82,7 +80,7 @@ class CommunityReportsExtractor:
|
||||
response = re.sub(r"[^\}]*$", "", response)
|
||||
response = re.sub(r"\{\{", "{", response)
|
||||
response = re.sub(r"\}\}", "}", response)
|
||||
print(response)
|
||||
logger.info(response)
|
||||
response = json.loads(response)
|
||||
if not dict_has_keys_with_types(response, [
|
||||
("title", str),
|
||||
@ -94,7 +92,7 @@ class CommunityReportsExtractor:
|
||||
response["weight"] = weight
|
||||
response["entities"] = ents
|
||||
except Exception as e:
|
||||
print("ERROR: ", traceback.format_exc())
|
||||
logger.exception("CommunityReportsExtractor got exception")
|
||||
self._on_error(e, traceback.format_exc(), None)
|
||||
continue
|
||||
|
||||
@ -127,5 +125,4 @@ class CommunityReportsExtractor:
|
||||
report_sections = "\n\n".join(
|
||||
f"## {finding_summary(f)}\n\n{finding_explanation(f)}" for f in findings
|
||||
)
|
||||
|
||||
return f"# {title}\n\n{summary}\n\n{report_sections}"
|
||||
|
@ -28,6 +28,7 @@ from graphrag.graph_extractor import GraphExtractor, DEFAULT_ENTITY_TYPES
|
||||
from graphrag.mind_map_extractor import MindMapExtractor
|
||||
from rag.nlp import rag_tokenizer
|
||||
from rag.utils import num_tokens_from_string
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
def graph_merge(g1, g2):
|
||||
@ -94,7 +95,7 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: List[str], callback, en
|
||||
chunks = []
|
||||
for n, attr in graph.nodes(data=True):
|
||||
if attr.get("rank", 0) == 0:
|
||||
print(f"Ignore entity: {n}")
|
||||
logger.info(f"Ignore entity: {n}")
|
||||
continue
|
||||
chunk = {
|
||||
"name_kwd": n,
|
||||
@ -136,7 +137,7 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: List[str], callback, en
|
||||
mg = mindmap(_chunks).output
|
||||
if not len(mg.keys()): return chunks
|
||||
|
||||
print(json.dumps(mg, ensure_ascii=False, indent=2))
|
||||
logger.info(json.dumps(mg, ensure_ascii=False, indent=2))
|
||||
chunks.append(
|
||||
{
|
||||
"content_with_weight": json.dumps(mg, ensure_ascii=False, indent=2),
|
||||
|
@ -18,7 +18,6 @@ import collections
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
@ -30,6 +29,7 @@ from rag.llm.chat_model import Base as CompletionLLM
|
||||
import markdown_to_json
|
||||
from functools import reduce
|
||||
from rag.utils import num_tokens_from_string
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -193,6 +193,6 @@ class MindMapExtractor:
|
||||
gen_conf = {"temperature": 0.5}
|
||||
response = self._llm.chat(text, [{"role": "user", "content": "Output:"}], gen_conf)
|
||||
response = re.sub(r"```[^\n]*", "", response)
|
||||
print(response)
|
||||
print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
|
||||
logger.info(response)
|
||||
logger.info(self._todict(markdown_to_json.dictify(response)))
|
||||
return self._todict(markdown_to_json.dictify(response))
|
||||
|
@ -2,7 +2,7 @@ import requests
|
||||
from bridge.context import ContextType # Import Context, ContextType
|
||||
from bridge.reply import Reply, ReplyType # Import Reply, ReplyType
|
||||
from bridge import *
|
||||
from common.log import logger
|
||||
from api.utils.log_utils import logger
|
||||
from plugins import Plugin, register # Import Plugin and register
|
||||
from plugins.event import Event, EventContext, EventAction # Import event-related classes
|
||||
|
||||
@ -76,7 +76,7 @@ class RAGFlowChat(Plugin):
|
||||
logger.error(f"[RAGFlowChat] HTTP error when creating conversation: {response.status_code}")
|
||||
return f"Sorry, unable to connect to RAGFlow API (create conversation). HTTP status code: {response.status_code}"
|
||||
except Exception as e:
|
||||
logger.exception(f"[RAGFlowChat] Exception when creating conversation: {e}")
|
||||
logger.exception("[RAGFlowChat] Exception when creating conversation")
|
||||
return f"Sorry, an internal error occurred: {str(e)}"
|
||||
|
||||
# Step 2: Send the message and get a reply
|
||||
@ -108,5 +108,5 @@ class RAGFlowChat(Plugin):
|
||||
logger.error(f"[RAGFlowChat] HTTP error when getting answer: {response.status_code}")
|
||||
return f"Sorry, unable to connect to RAGFlow API (get reply). HTTP status code: {response.status_code}"
|
||||
except Exception as e:
|
||||
logger.exception(f"[RAGFlowChat] Exception when getting answer: {e}")
|
||||
logger.exception("[RAGFlowChat] Exception when getting answer")
|
||||
return f"Sorry, an internal error occurred: {str(e)}"
|
||||
|
@ -20,6 +20,7 @@ from rag.nlp import bullets_category, is_english,remove_contents_table, \
|
||||
tokenize_chunks
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -38,7 +39,7 @@ class Pdf(PdfParser):
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.67, "Layout analysis finished")
|
||||
print("layouts:", timer() - start)
|
||||
logger.info("layouts: {}".format(timer() - start))
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.68, "Table analysis finished")
|
||||
self._text_merge()
|
||||
|
@ -18,7 +18,7 @@ import re
|
||||
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
|
||||
from deepdoc.parser import HtmlParser, TxtParser
|
||||
from timeit import default_timer as timer
|
||||
from rag.settings import cron_logger
|
||||
from api.utils.log_utils import logger
|
||||
import io
|
||||
|
||||
|
||||
@ -86,7 +86,7 @@ def chunk(
|
||||
)
|
||||
|
||||
main_res.extend(tokenize_chunks(chunks, doc, eng, None))
|
||||
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
# get the attachment info
|
||||
for part in msg.iter_attachments():
|
||||
content_disposition = part.get("Content-Disposition")
|
||||
|
@ -21,7 +21,7 @@ from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge,
|
||||
make_colon_as_title, tokenize_chunks, docx_question_level
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
|
||||
from rag.settings import cron_logger
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Docx(DocxParser):
|
||||
@ -122,8 +122,8 @@ class Pdf(PdfParser):
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.67, "Layout analysis finished")
|
||||
cron_logger.info("layouts:".format(
|
||||
(timer() - start) / (self.total_page + 0.1)))
|
||||
logger.info("layouts:".format(
|
||||
))
|
||||
self._naive_vertical_merge()
|
||||
|
||||
callback(0.8, "Text extraction finished")
|
||||
|
@ -24,6 +24,7 @@ from rag.utils import num_tokens_from_string
|
||||
from deepdoc.parser import PdfParser, PlainParser, DocxParser
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -47,11 +48,11 @@ class Pdf(PdfParser):
|
||||
# for bb in self.boxes:
|
||||
# for b in bb:
|
||||
# print(b)
|
||||
print("OCR:", timer() - start)
|
||||
logger.info("OCR: {}".format(timer() - start))
|
||||
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.65, "Layout analysis finished.")
|
||||
print("layouts:", timer() - start)
|
||||
logger.info("layouts: {}".format(timer() - start))
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.67, "Table analysis finished.")
|
||||
self._text_merge()
|
||||
|
@ -19,7 +19,7 @@ from deepdoc.parser.pdf_parser import PlainParser
|
||||
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
|
||||
naive_merge_docx, tokenize_chunks_docx
|
||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
|
||||
from rag.settings import cron_logger
|
||||
from api.utils.log_utils import logger
|
||||
from rag.utils import num_tokens_from_string
|
||||
from PIL import Image
|
||||
from functools import reduce
|
||||
@ -41,18 +41,18 @@ class Docx(DocxParser):
|
||||
try:
|
||||
image_blob = related_part.image.blob
|
||||
except UnrecognizedImageError:
|
||||
print("Unrecognized image format. Skipping image.")
|
||||
logger.info("Unrecognized image format. Skipping image.")
|
||||
return None
|
||||
except UnexpectedEndOfFileError:
|
||||
print("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
|
||||
logger.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
|
||||
return None
|
||||
except InvalidImageStreamError:
|
||||
print("The recognized image stream appears to be corrupted. Skipping image.")
|
||||
logger.info("The recognized image stream appears to be corrupted. Skipping image.")
|
||||
return None
|
||||
try:
|
||||
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
||||
return image
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def __clean(self, line):
|
||||
@ -133,7 +133,7 @@ class Pdf(PdfParser):
|
||||
callback
|
||||
)
|
||||
callback(msg="OCR finished")
|
||||
cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
|
||||
logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
|
||||
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
@ -147,7 +147,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward()
|
||||
# self._filter_forpages()
|
||||
|
||||
cron_logger.info("layouts: {}".format(timer() - start))
|
||||
logger.info("layouts cost: {}s".format(timer() - start))
|
||||
return [(b["text"], self._line_tag(b, zoomin))
|
||||
for b in self.boxes], tbls
|
||||
|
||||
@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
return chunks
|
||||
|
||||
res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
|
||||
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
return res
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
@ -280,7 +280,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
return chunks
|
||||
|
||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
return res
|
||||
|
||||
|
||||
|
@ -18,6 +18,7 @@ from deepdoc.parser.utils import get_text
|
||||
from rag.app import laws
|
||||
from rag.nlp import rag_tokenizer, tokenize
|
||||
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -37,7 +38,7 @@ class Pdf(PdfParser):
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin, drop=False)
|
||||
callback(0.63, "Layout analysis finished.")
|
||||
print("layouts:", timer() - start)
|
||||
logger.info("layouts cost: {}s".format(timer() - start))
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.65, "Table analysis finished.")
|
||||
self._text_merge()
|
||||
|
@ -17,6 +17,7 @@ from api.db import ParserType
|
||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||
from deepdoc.parser import PdfParser, PlainParser
|
||||
import numpy as np
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -40,7 +41,7 @@ class Pdf(PdfParser):
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.63, "Layout analysis finished")
|
||||
print("layouts:", timer() - start)
|
||||
logger.info(f"layouts cost: {timer() - start}s")
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.68, "Table analysis finished")
|
||||
self._text_merge()
|
||||
@ -52,8 +53,8 @@ class Pdf(PdfParser):
|
||||
|
||||
# clean mess
|
||||
if column_width < self.page_images[0].size[0] / zoomin / 2:
|
||||
print("two_column...................", column_width,
|
||||
self.page_images[0].size[0] / zoomin / 2)
|
||||
logger.info("two_column................... {} {}".format(column_width,
|
||||
self.page_images[0].size[0] / zoomin / 2))
|
||||
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
|
||||
for b in self.boxes:
|
||||
b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
|
||||
@ -114,8 +115,8 @@ class Pdf(PdfParser):
|
||||
from_page, min(
|
||||
to_page, self.total_page)))
|
||||
for b in self.boxes:
|
||||
print(b["text"], b.get("layoutno"))
|
||||
print(tbls)
|
||||
logger.info("{} {}".format(b["text"], b.get("layoutno")))
|
||||
logger.info("{}".format(tbls))
|
||||
|
||||
return {
|
||||
"title": title,
|
||||
@ -156,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
|
||||
# is it English
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
print("It's English.....", eng)
|
||||
logger.info("It's English.....{}".format(eng))
|
||||
|
||||
res = tokenize_table(paper["tables"], doc, eng)
|
||||
|
||||
@ -183,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
|
||||
sid += 1
|
||||
sec_ids.append(sid)
|
||||
print(lvl, sorted_sections[i][0], most_level, sid)
|
||||
logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
|
||||
|
||||
chunks = []
|
||||
last_sid = -2
|
||||
|
@ -19,7 +19,7 @@ from openpyxl import load_workbook
|
||||
from deepdoc.parser.utils import get_text
|
||||
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
|
||||
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
|
||||
from rag.settings import cron_logger
|
||||
from api.utils.log_utils import logger
|
||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
@ -82,7 +82,7 @@ class Pdf(PdfParser):
|
||||
callback
|
||||
)
|
||||
callback(msg="OCR finished")
|
||||
cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
|
||||
logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin, drop=False)
|
||||
callback(0.63, "Layout analysis finished.")
|
||||
@ -94,7 +94,7 @@ class Pdf(PdfParser):
|
||||
#self._naive_vertical_merge()
|
||||
# self._concat_downward()
|
||||
#self._filter_forpages()
|
||||
cron_logger.info("layouts: {}".format(timer() - start))
|
||||
logger.info("layouts: {}".format(timer() - start))
|
||||
sections = [b["text"] for b in self.boxes]
|
||||
bull_x0_list = []
|
||||
q_bull, reg = qbullets_category(sections)
|
||||
|
@ -14,14 +14,13 @@ import base64
|
||||
import datetime
|
||||
import json
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser.resume import refactor
|
||||
from deepdoc.parser.resume import step_one, step_two
|
||||
from rag.settings import cron_logger
|
||||
from api.utils.log_utils import logger
|
||||
from rag.utils import rmSpace
|
||||
|
||||
forbidden_select_fields4resume = [
|
||||
@ -64,8 +63,8 @@ def remote_call(filename, binary):
|
||||
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
|
||||
resume = step_two.parse(resume)
|
||||
return resume
|
||||
except Exception as e:
|
||||
cron_logger.error("Resume parser error: " + str(e))
|
||||
except Exception:
|
||||
logger.exception("Resume parser error")
|
||||
return {}
|
||||
|
||||
|
||||
@ -87,7 +86,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
callback(-1, "Resume is not successfully parsed.")
|
||||
raise Exception("Resume parser remote call fail!")
|
||||
callback(0.6, "Done parsing. Chunking...")
|
||||
print(json.dumps(resume, ensure_ascii=False, indent=2))
|
||||
logger.info("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2))
|
||||
|
||||
field_map = {
|
||||
"name_kwd": "姓名/名字",
|
||||
@ -159,7 +158,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
|
||||
doc[n] = resume[n]
|
||||
|
||||
print(doc)
|
||||
logger.info("chunked resume to " + str(doc))
|
||||
KnowledgebaseService.update_parser_config(
|
||||
kwargs["kb_id"], {"field_map": field_map})
|
||||
return [doc]
|
||||
|
@ -32,6 +32,7 @@ from api.utils.file_utils import get_home_cache_dir
|
||||
from rag.utils import num_tokens_from_string, truncate
|
||||
import google.generativeai as genai
|
||||
import json
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
class Base(ABC):
|
||||
def __init__(self, key, model_name):
|
||||
@ -68,7 +69,7 @@ class DefaultEmbedding(Base):
|
||||
DefaultEmbedding._model = FlagModel(os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z]+/", "", model_name)),
|
||||
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
||||
use_fp16=torch.cuda.is_available())
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5",
|
||||
local_dir=os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z]+/", "", model_name)),
|
||||
local_dir_use_symlinks=False)
|
||||
@ -189,7 +190,7 @@ class QWenEmbed(Base):
|
||||
)
|
||||
return np.array(resp["output"]["embeddings"][0]
|
||||
["embedding"]), resp["usage"]["total_tokens"]
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
raise Exception("Account abnormal. Please ensure it's on good standing to use QWen's "+self.model_name)
|
||||
return np.array([]), 0
|
||||
|
||||
@ -296,11 +297,11 @@ class YoudaoEmbed(Base):
|
||||
if not LIGHTEN and not YoudaoEmbed._client:
|
||||
from BCEmbedding import EmbeddingModel as qanthing
|
||||
try:
|
||||
print("LOADING BCE...")
|
||||
logger.info("LOADING BCE...")
|
||||
YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join(
|
||||
get_home_cache_dir(),
|
||||
"bce-embedding-base_v1"))
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
YoudaoEmbed._client = qanthing(
|
||||
model_name_or_path=model_name.replace(
|
||||
"maidalun1020", "InfiniFlow"))
|
||||
|
@ -27,6 +27,7 @@ from api.settings import LIGHTEN
|
||||
from api.utils.file_utils import get_home_cache_dir
|
||||
from rag.utils import num_tokens_from_string, truncate
|
||||
import json
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
def sigmoid(x):
|
||||
@ -66,7 +67,7 @@ class DefaultRerank(Base):
|
||||
DefaultRerank._model = FlagReranker(
|
||||
os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z]+/", "", model_name)),
|
||||
use_fp16=torch.cuda.is_available())
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
model_dir = snapshot_download(repo_id=model_name,
|
||||
local_dir=os.path.join(get_home_cache_dir(),
|
||||
re.sub(r"^[a-zA-Z]+/", "", model_name)),
|
||||
@ -126,11 +127,11 @@ class YoudaoRerank(DefaultRerank):
|
||||
with YoudaoRerank._model_lock:
|
||||
if not YoudaoRerank._model:
|
||||
try:
|
||||
print("LOADING BCE...")
|
||||
logger.info("LOADING BCE...")
|
||||
YoudaoRerank._model = RerankerModel(model_name_or_path=os.path.join(
|
||||
get_home_cache_dir(),
|
||||
re.sub(r"^[a-zA-Z]+/", "", model_name)))
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
YoudaoRerank._model = RerankerModel(
|
||||
model_name_or_path=model_name.replace(
|
||||
"maidalun1020", "InfiniFlow"))
|
||||
|
@ -26,6 +26,7 @@ from word2number import w2n
|
||||
from cn2an import cn2an
|
||||
from PIL import Image
|
||||
import json
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
all_codecs = [
|
||||
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
|
||||
@ -235,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
|
||||
# wrap up as es documents
|
||||
for ck in chunks:
|
||||
if len(ck.strip()) == 0:continue
|
||||
print("--", ck)
|
||||
logger.debug("-- {}".format(ck))
|
||||
d = copy.deepcopy(doc)
|
||||
if pdf_parser:
|
||||
try:
|
||||
@ -254,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
|
||||
# wrap up as es documents
|
||||
for ck, image in zip(chunks, images):
|
||||
if len(ck.strip()) == 0:continue
|
||||
print("--", ck)
|
||||
logger.debug("-- {}".format(ck))
|
||||
d = copy.deepcopy(doc)
|
||||
d["image"] = image
|
||||
tokenize(d, ck, eng)
|
||||
@ -457,7 +458,7 @@ def hierarchical_merge(bull, sections, depth):
|
||||
|
||||
for i in range(len(cks)):
|
||||
cks[i] = [sections[j] for j in cks[i][::-1]]
|
||||
print("--------------\n", "\n* ".join(cks[i]))
|
||||
logger.info("\n* ".join(cks[i]))
|
||||
|
||||
res = [[]]
|
||||
num = [0]
|
||||
|
@ -22,10 +22,10 @@ import re
|
||||
import string
|
||||
import sys
|
||||
from hanziconv import HanziConv
|
||||
from huggingface_hub import snapshot_download
|
||||
from nltk import word_tokenize
|
||||
from nltk.stem import PorterStemmer, WordNetLemmatizer
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class RagTokenizer:
|
||||
@ -36,7 +36,7 @@ class RagTokenizer:
|
||||
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
|
||||
|
||||
def loadDict_(self, fnm):
|
||||
print("[HUQIE]:Build trie", fnm, file=sys.stderr)
|
||||
logger.info(f"[HUQIE]:Build trie {fnm}")
|
||||
try:
|
||||
of = open(fnm, "r", encoding='utf-8')
|
||||
while True:
|
||||
@ -52,8 +52,9 @@ class RagTokenizer:
|
||||
self.trie_[self.rkey_(line[0])] = 1
|
||||
self.trie_.save(fnm + ".trie")
|
||||
of.close()
|
||||
except Exception as e:
|
||||
print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
|
||||
except Exception:
|
||||
logger.exception(f"[HUQIE]:Build trie {fnm} failed")
|
||||
|
||||
|
||||
def __init__(self, debug=False):
|
||||
self.DEBUG = debug
|
||||
@ -68,8 +69,8 @@ class RagTokenizer:
|
||||
try:
|
||||
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
|
||||
return
|
||||
except Exception as e:
|
||||
print("[HUQIE]:Build default trie", file=sys.stderr)
|
||||
except Exception:
|
||||
logger.exception("[HUQIE]:Build default trie")
|
||||
self.trie_ = datrie.Trie(string.printable)
|
||||
|
||||
self.loadDict_(self.DIR_ + ".txt")
|
||||
@ -78,7 +79,7 @@ class RagTokenizer:
|
||||
try:
|
||||
self.trie_ = datrie.Trie.load(fnm + ".trie")
|
||||
return
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
self.trie_ = datrie.Trie(string.printable)
|
||||
self.loadDict_(fnm)
|
||||
|
||||
@ -173,8 +174,7 @@ class RagTokenizer:
|
||||
tks.append(tk)
|
||||
F /= len(tks)
|
||||
L /= len(tks)
|
||||
if self.DEBUG:
|
||||
print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
|
||||
logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
|
||||
return tks, B / len(tks) + L + F
|
||||
|
||||
def sortTks_(self, tkslist):
|
||||
@ -278,8 +278,8 @@ class RagTokenizer:
|
||||
tks, s = self.maxForward_(L)
|
||||
tks1, s1 = self.maxBackward_(L)
|
||||
if self.DEBUG:
|
||||
print("[FW]", tks, s)
|
||||
print("[BW]", tks1, s1)
|
||||
logger.debug("[FW] {} {}".format(tks, s))
|
||||
logger.debug("[BW] {} {}".format(tks1, s1))
|
||||
|
||||
i, j, _i, _j = 0, 0, 0, 0
|
||||
same = 0
|
||||
@ -326,8 +326,7 @@ class RagTokenizer:
|
||||
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
||||
|
||||
res = " ".join(self.english_normalize_(res))
|
||||
if self.DEBUG:
|
||||
print("[TKS]", self.merge_(res))
|
||||
logger.debug("[TKS] {}".format(self.merge_(res)))
|
||||
return self.merge_(res)
|
||||
|
||||
def fine_grained_tokenize(self, tks):
|
||||
@ -418,30 +417,30 @@ if __name__ == '__main__':
|
||||
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
|
||||
tks = tknzr.tokenize(
|
||||
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("虽然我不怎么玩")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
if len(sys.argv) < 2:
|
||||
sys.exit()
|
||||
tknzr.DEBUG = False
|
||||
@ -451,5 +450,5 @@ if __name__ == '__main__':
|
||||
line = of.readline()
|
||||
if not line:
|
||||
break
|
||||
print(tknzr.tokenize(line))
|
||||
logger.info(tknzr.tokenize(line))
|
||||
of.close()
|
||||
|
@ -19,7 +19,7 @@ import json
|
||||
from typing import List, Optional, Dict, Union
|
||||
from dataclasses import dataclass
|
||||
|
||||
from rag.settings import doc_store_logger
|
||||
from api.utils.log_utils import logger
|
||||
from rag.utils import rmSpace
|
||||
from rag.nlp import rag_tokenizer, query
|
||||
import numpy as np
|
||||
@ -83,7 +83,7 @@ class Dealer:
|
||||
orderBy.desc("create_timestamp_flt")
|
||||
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
|
||||
total=self.dataStore.getTotal(res)
|
||||
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
else:
|
||||
highlightFields = ["content_ltks", "title_tks"] if highlight else []
|
||||
matchText, keywords = self.qryr.question(qst, min_match=0.3)
|
||||
@ -91,7 +91,7 @@ class Dealer:
|
||||
matchExprs = [matchText]
|
||||
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
|
||||
total=self.dataStore.getTotal(res)
|
||||
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
else:
|
||||
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
|
||||
q_vec = matchDense.embedding_data
|
||||
@ -102,7 +102,7 @@ class Dealer:
|
||||
|
||||
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
|
||||
total=self.dataStore.getTotal(res)
|
||||
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
|
||||
# If result is empty, try again with lower min_match
|
||||
if total == 0:
|
||||
@ -112,7 +112,7 @@ class Dealer:
|
||||
matchDense.extra_options["similarity"] = 0.17
|
||||
res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
|
||||
total=self.dataStore.getTotal(res)
|
||||
doc_store_logger.info("Dealer.search 2 TOTAL: {}".format(total))
|
||||
logger.info("Dealer.search 2 TOTAL: {}".format(total))
|
||||
|
||||
for k in keywords:
|
||||
kwds.add(k)
|
||||
@ -123,7 +123,7 @@ class Dealer:
|
||||
continue
|
||||
kwds.add(kk)
|
||||
|
||||
doc_store_logger.info(f"TOTAL: {total}")
|
||||
logger.info(f"TOTAL: {total}")
|
||||
ids=self.dataStore.getChunkIds(res)
|
||||
keywords=list(kwds)
|
||||
highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
|
||||
@ -180,7 +180,7 @@ class Dealer:
|
||||
continue
|
||||
idx.append(i)
|
||||
pieces_.append(t)
|
||||
doc_store_logger.info("{} => {}".format(answer, pieces_))
|
||||
logger.info("{} => {}".format(answer, pieces_))
|
||||
if not pieces_:
|
||||
return answer, set([])
|
||||
|
||||
@ -201,7 +201,7 @@ class Dealer:
|
||||
chunks_tks,
|
||||
tkweight, vtweight)
|
||||
mx = np.max(sim) * 0.99
|
||||
doc_store_logger.info("{} SIM: {}".format(pieces_[i], mx))
|
||||
logger.info("{} SIM: {}".format(pieces_[i], mx))
|
||||
if mx < thr:
|
||||
continue
|
||||
cites[idx[i]] = list(
|
||||
|
@ -17,10 +17,10 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import re
|
||||
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Dealer:
|
||||
@ -32,15 +32,15 @@ class Dealer:
|
||||
path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
|
||||
try:
|
||||
self.dictionary = json.load(open(path, 'r'))
|
||||
except Exception as e:
|
||||
logging.warn("Missing synonym.json")
|
||||
except Exception:
|
||||
logger.warn("Missing synonym.json")
|
||||
self.dictionary = {}
|
||||
|
||||
if not redis:
|
||||
logging.warning(
|
||||
logger.warning(
|
||||
"Realtime synonym is disabled, since no redis connection.")
|
||||
if not len(self.dictionary.keys()):
|
||||
logging.warning(f"Fail to load synonym")
|
||||
logger.warning("Fail to load synonym")
|
||||
|
||||
self.redis = redis
|
||||
self.load()
|
||||
@ -64,7 +64,7 @@ class Dealer:
|
||||
d = json.loads(d)
|
||||
self.dictionary = d
|
||||
except Exception as e:
|
||||
logging.error("Fail to load synonym!" + str(e))
|
||||
logger.error("Fail to load synonym!" + str(e))
|
||||
|
||||
def lookup(self, tk):
|
||||
self.lookup_num += 1
|
||||
|
@ -21,6 +21,7 @@ import os
|
||||
import numpy as np
|
||||
from rag.nlp import rag_tokenizer
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Dealer:
|
||||
@ -81,12 +82,12 @@ class Dealer:
|
||||
self.ne, self.df = {}, {}
|
||||
try:
|
||||
self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
|
||||
except Exception as e:
|
||||
print("[WARNING] Load ner.json FAIL!")
|
||||
except Exception:
|
||||
logger.warning("Load ner.json FAIL!")
|
||||
try:
|
||||
self.df = load_dict(os.path.join(fnm, "term.freq"))
|
||||
except Exception as e:
|
||||
print("[WARNING] Load term.freq FAIL!")
|
||||
except Exception:
|
||||
logger.warning("Load term.freq FAIL!")
|
||||
|
||||
def pretoken(self, txt, num=False, stpwd=True):
|
||||
patt = [
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import re
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor, ALL_COMPLETED, wait
|
||||
from threading import Lock
|
||||
from typing import Tuple
|
||||
@ -22,7 +21,8 @@ import umap
|
||||
import numpy as np
|
||||
from sklearn.mixture import GaussianMixture
|
||||
|
||||
from rag.utils import num_tokens_from_string, truncate
|
||||
from rag.utils import truncate
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||
@ -62,14 +62,13 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||
{"temperature": 0.3, "max_tokens": self._max_token}
|
||||
)
|
||||
cnt = re.sub("(······\n由于长度的原因,回答被截断了,要继续吗?|For the content length reason, it stopped, continue?)", "", cnt)
|
||||
print("SUM:", cnt)
|
||||
logger.info(f"SUM: {cnt}")
|
||||
embds, _ = self._embd_model.encode([cnt])
|
||||
with lock:
|
||||
if not len(embds[0]): return
|
||||
chunks.append((cnt, embds[0]))
|
||||
except Exception as e:
|
||||
print(e, flush=True)
|
||||
traceback.print_stack(e)
|
||||
logger.exception("summarize got exception")
|
||||
return e
|
||||
|
||||
labels = []
|
||||
@ -105,7 +104,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||
ck_idx = [i+start for i in range(len(lbls)) if lbls[i] == c]
|
||||
threads.append(executor.submit(summarize, ck_idx, lock))
|
||||
wait(threads, return_when=ALL_COMPLETED)
|
||||
print([t.result() for t in threads])
|
||||
logger.info(str([t.result() for t in threads]))
|
||||
|
||||
assert len(chunks) - end == n_clusters, "{} vs. {}".format(len(chunks) - end, n_clusters)
|
||||
labels.extend(lbls)
|
||||
|
@ -14,15 +14,11 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import os
|
||||
import logging
|
||||
from api.utils import get_base_config, decrypt_database_config
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import LoggerFactory, getLogger
|
||||
|
||||
|
||||
# Server
|
||||
RAG_CONF_PATH = os.path.join(get_project_base_directory(), "conf")
|
||||
SUBPROCESS_STD_LOG_NAME = "std.log"
|
||||
|
||||
ES = get_base_config("es", {})
|
||||
INFINITY = get_base_config("infinity", {"uri": "infinity:23817"})
|
||||
@ -36,29 +32,6 @@ except Exception:
|
||||
pass
|
||||
DOC_MAXIMUM_SIZE = int(os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024))
|
||||
|
||||
# Logger
|
||||
LoggerFactory.set_directory(
|
||||
os.path.join(
|
||||
get_project_base_directory(),
|
||||
"logs",
|
||||
"rag"))
|
||||
# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
|
||||
LoggerFactory.LEVEL = 30
|
||||
|
||||
doc_store_logger = getLogger("doc_store")
|
||||
minio_logger = getLogger("minio")
|
||||
s3_logger = getLogger("s3")
|
||||
azure_logger = getLogger("azure")
|
||||
cron_logger = getLogger("cron_logger")
|
||||
chunk_logger = getLogger("chunk_logger")
|
||||
database_logger = getLogger("database")
|
||||
|
||||
formatter = logging.Formatter("%(asctime)-15s %(levelname)-8s (%(process)d) %(message)s")
|
||||
for logger in [doc_store_logger, minio_logger, s3_logger, azure_logger, cron_logger, chunk_logger, database_logger]:
|
||||
logger.setLevel(logging.INFO)
|
||||
for handler in logger.handlers:
|
||||
handler.setFormatter(fmt=formatter)
|
||||
|
||||
SVR_QUEUE_NAME = "rag_flow_svr_queue"
|
||||
SVR_QUEUE_RETENTION = 60*60
|
||||
SVR_QUEUE_MAX_LEN = 1024
|
||||
|
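The removed block above configured LoggerFactory and a separate logger per subsystem (doc_store_logger, minio_logger, cron_logger, and so on). Their replacement lives in api/utils/log_utils.py, which this diff only imports from. Purely as a hypothetical sketch of a single file-backed logger of that shape (the LOG_FILE location and rotation settings are assumptions; only the logger/LOG_FILE names and the format string come from this diff):

import logging
import os
from logging.handlers import RotatingFileHandler

LOG_FILE = os.path.abspath(os.path.join("logs", "ragflow.log"))  # assumed path, not taken from the repository
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)
handler = RotatingFileHandler(LOG_FILE, maxBytes=10 * 1024 * 1024, backupCount=5)
handler.setFormatter(logging.Formatter("%(asctime)-15s %(levelname)-8s (%(process)d) %(message)s"))
logger.addHandler(handler)
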
@ -13,20 +13,19 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import random
|
||||
import time
|
||||
import traceback
|
||||
|
||||
from api.db.db_models import close_connection
|
||||
from api.db.services.task_service import TaskService
|
||||
from rag.settings import cron_logger
|
||||
from api.utils.log_utils import logger
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
from rag.utils.redis_conn import REDIS_CONN
|
||||
|
||||
|
||||
def collect():
|
||||
doc_locations = TaskService.get_ongoing_doc_name()
|
||||
print(doc_locations)
|
||||
logger.info(doc_locations)
|
||||
if len(doc_locations) == 0:
|
||||
time.sleep(1)
|
||||
return
|
||||
@ -35,7 +34,7 @@ def collect():
|
||||
def main():
|
||||
locations = collect()
|
||||
if not locations:return
|
||||
print("TASKS:", len(locations))
|
||||
logger.info(f"TASKS: {len(locations)}")
|
||||
for kb_id, loc in locations:
|
||||
try:
|
||||
if REDIS_CONN.is_alive():
|
||||
@ -44,7 +43,7 @@ def main():
|
||||
if REDIS_CONN.exist(key):continue
|
||||
file_bin = STORAGE_IMPL.get(kb_id, loc)
|
||||
REDIS_CONN.transaction(key, file_bin, 12 * 60)
|
||||
cron_logger.info("CACHE: {}".format(loc))
|
||||
logger.info("CACHE: {}".format(loc))
|
||||
except Exception as e:
|
||||
traceback.print_stack(e)
|
||||
except Exception as e:
|
||||
|
@ -17,6 +17,7 @@ import discord
|
||||
import requests
|
||||
import base64
|
||||
import asyncio
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
URL = '{YOUR_IP_ADDRESS:PORT}/v1/api/completion_aibotk' # Default: https://demo.ragflow.io/v1/api/completion_aibotk
|
||||
|
||||
@ -36,7 +37,7 @@ client = discord.Client(intents=intents)
|
||||
|
||||
@client.event
|
||||
async def on_ready():
|
||||
print(f'We have logged in as {client.user}')
|
||||
logger.info(f'We have logged in as {client.user}')
|
||||
|
||||
|
||||
@client.event
|
||||
|
@ -22,7 +22,6 @@ import copy
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from io import BytesIO
|
||||
@ -43,8 +42,8 @@ from api.db.db_models import close_connection
|
||||
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
|
||||
from rag.nlp import search, rag_tokenizer
|
||||
from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
|
||||
from rag.settings import database_logger, SVR_QUEUE_NAME
|
||||
from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
|
||||
from api.utils.log_utils import logger, LOG_FILE
|
||||
from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME
|
||||
from rag.utils import rmSpace, num_tokens_from_string
|
||||
from rag.utils.redis_conn import REDIS_CONN, Payload
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
@ -90,8 +89,8 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing...
|
||||
d["progress"] = prog
|
||||
try:
|
||||
TaskService.update_progress(task_id, d)
|
||||
except Exception as e:
|
||||
cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
|
||||
except Exception:
|
||||
logger.exception(f"set_progress({task_id}) got exception")
|
||||
|
||||
close_connection()
|
||||
if cancel:
|
||||
@ -110,8 +109,8 @@ def collect():
|
||||
if not PAYLOAD:
|
||||
time.sleep(1)
|
||||
return pd.DataFrame()
|
||||
except Exception as e:
|
||||
cron_logger.error("Get task event from queue exception:" + str(e))
|
||||
except Exception:
|
||||
logger.exception("Get task event from queue exception")
|
||||
return pd.DataFrame()
|
||||
|
||||
msg = PAYLOAD.get_message()
|
||||
@ -119,11 +118,11 @@ def collect():
|
||||
return pd.DataFrame()
|
||||
|
||||
if TaskService.do_cancel(msg["id"]):
|
||||
cron_logger.info("Task {} has been canceled.".format(msg["id"]))
|
||||
logger.info("Task {} has been canceled.".format(msg["id"]))
|
||||
return pd.DataFrame()
|
||||
tasks = TaskService.get_tasks(msg["id"])
|
||||
if not tasks:
|
||||
cron_logger.warning("{} empty task!".format(msg["id"]))
|
||||
logger.warning("{} empty task!".format(msg["id"]))
|
||||
return []
|
||||
|
||||
tasks = pd.DataFrame(tasks)
|
||||
@ -152,33 +151,29 @@ def build(row):
|
||||
st = timer()
|
||||
bucket, name = File2DocumentService.get_storage_address(doc_id=row["doc_id"])
|
||||
binary = get_storage_binary(bucket, name)
|
||||
cron_logger.info(
|
||||
logger.info(
|
||||
"From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
|
||||
except TimeoutError:
|
||||
callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.")
|
||||
cron_logger.error(
|
||||
"Minio {}/{}: Fetch file from minio timeout.".format(row["location"], row["name"]))
|
||||
logger.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"]))
|
||||
return
|
||||
except Exception as e:
|
||||
if re.search("(No such file|not found)", str(e)):
|
||||
callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"])
|
||||
else:
|
||||
callback(-1, "Get file from minio: %s" % str(e).replace("'", ""))
|
||||
traceback.print_exc()
|
||||
logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
|
||||
return
|
||||
|
||||
try:
|
||||
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
|
||||
to_page=row["to_page"], lang=row["language"], callback=callback,
|
||||
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
|
||||
cron_logger.info(
|
||||
"Chunking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
|
||||
logger.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"]))
|
||||
except Exception as e:
|
||||
callback(-1, "Internal server error while chunking: %s" %
|
||||
str(e).replace("'", ""))
|
||||
cron_logger.error(
|
||||
"Chunking {}/{}: {}".format(row["location"], row["name"], str(e)))
|
||||
traceback.print_exc()
|
||||
logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
|
||||
return
|
||||
|
||||
docs = []
|
||||
@ -214,14 +209,13 @@ def build(row):
|
||||
st = timer()
|
||||
STORAGE_IMPL.put(row["kb_id"], d["id"], output_buffer.getvalue())
|
||||
el += timer() - st
|
||||
except Exception as e:
|
||||
cron_logger.error(str(e))
|
||||
traceback.print_exc()
|
||||
except Exception:
|
||||
logger.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"]))
|
||||
|
||||
d["img_id"] = "{}-{}".format(row["kb_id"], d["id"])
|
||||
del d["image"]
|
||||
docs.append(d)
|
||||
cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
|
||||
logger.info("MINIO PUT({}):{}".format(row["name"], el))
|
||||
|
||||
if row["parser_config"].get("auto_keywords", 0):
|
||||
callback(msg="Start to generate keywords for every chunk ...")
|
||||
@ -347,7 +341,7 @@ def main():
|
||||
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
|
||||
except Exception as e:
|
||||
callback(-1, msg=str(e))
|
||||
cron_logger.error(str(e))
|
||||
logger.exception("LLMBundle got exception")
|
||||
continue
|
||||
|
||||
if r.get("task_type", "") == "raptor":
|
||||
@ -356,12 +350,12 @@ def main():
|
||||
cks, tk_count, vector_size = run_raptor(r, chat_mdl, embd_mdl, callback)
|
||||
except Exception as e:
|
||||
callback(-1, msg=str(e))
|
||||
cron_logger.error(str(e))
|
||||
logger.exception("run_raptor got exception")
|
||||
continue
|
||||
else:
|
||||
st = timer()
|
||||
cks = build(r)
|
||||
cron_logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
|
||||
logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
|
||||
if cks is None:
|
||||
continue
|
||||
if not cks:
|
||||
@ -377,12 +371,12 @@ def main():
|
||||
tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
|
||||
except Exception as e:
|
||||
callback(-1, "Embedding error:{}".format(str(e)))
|
||||
cron_logger.error(str(e))
|
||||
logger.exception("run_rembedding got exception")
|
||||
tk_count = 0
|
||||
cron_logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||
logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||
callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))
|
||||
|
||||
# cron_logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
|
||||
# logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
|
||||
init_kb(r, vector_size)
|
||||
chunk_count = len(set([c["id"] for c in cks]))
|
||||
st = timer()
|
||||
@ -393,11 +387,11 @@ def main():
|
||||
if b % 128 == 0:
|
||||
callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
|
||||
|
||||
cron_logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||
logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||
if es_r:
|
||||
callback(-1, "Insert chunk error, detail info please check ragflow-logs/api/cron_logger.log. Please also check ES status!")
|
||||
callback(-1, f"Insert chunk error, detail info please check {LOG_FILE}. Please also check ES status!")
|
||||
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
|
||||
cron_logger.error('Insert chunk error: ' + str(es_r))
|
||||
logger.error('Insert chunk error: ' + str(es_r))
|
||||
else:
|
||||
if TaskService.do_cancel(r["id"]):
|
||||
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
|
||||
@ -405,7 +399,7 @@ def main():
|
||||
callback(1., "Done!")
|
||||
DocumentService.increment_chunk_num(
|
||||
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|
||||
cron_logger.info(
|
||||
logger.info(
|
||||
"Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
|
||||
r["id"], tk_count, len(cks), timer() - st))
|
||||
|
||||
@ -421,16 +415,16 @@ def report_status():
|
||||
obj[CONSUMER_NAME].append(timer())
|
||||
obj[CONSUMER_NAME] = obj[CONSUMER_NAME][-60:]
|
||||
REDIS_CONN.set_obj("TASKEXE", obj, 60*2)
|
||||
except Exception as e:
|
||||
print("[Exception]:", str(e))
|
||||
except Exception:
|
||||
logger.exception("report_status got exception")
|
||||
time.sleep(30)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
peewee_logger = logging.getLogger('peewee')
|
||||
peewee_logger.propagate = False
|
||||
peewee_logger.addHandler(database_logger.handlers[0])
|
||||
peewee_logger.setLevel(database_logger.level)
|
||||
peewee_logger.addHandler(logger.handlers[0])
|
||||
peewee_logger.setLevel(logger.handlers[0].level)
|
||||
|
||||
exe = ThreadPoolExecutor(max_workers=1)
|
||||
exe.submit(report_status)
|
||||
|
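The task executor above now takes `logger` and `LOG_FILE` from `api.utils.log_utils` instead of the per-module `cron_logger` and `database_logger`, and its `__main__` hunk re-attaches the `peewee` logger to `logger.handlers[0]` so database output lands in the same place. The `api/utils/log_utils.py` module itself is not part of this excerpt; the following is only a minimal sketch of what a single-file logger of this kind could look like, with the path, logger name and rotation policy being assumptions rather than values taken from the PR:

```python
# Hypothetical sketch of a unified logger module; not the actual api/utils/log_utils.py.
# Path, logger name and rotation settings are assumptions for illustration only.
import logging
import os
from logging.handlers import RotatingFileHandler

LOG_FILE = os.path.abspath(os.path.join("logs", "ragflow.log"))  # assumed location
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)

# One rotating file handler shared by every component that imports `logger`.
_handler = RotatingFileHandler(LOG_FILE, maxBytes=10 * 1024 * 1024, backupCount=5)
_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))
logger.addHandler(_handler)
```

Routing everything through one handler is what lets the error message above refer to `{LOG_FILE}` instead of `ragflow-logs/api/cron_logger.log`.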
@ -2,7 +2,6 @@ import os
import time
from io import BytesIO
from rag import settings
from rag.settings import azure_logger
from rag.utils import singleton
from azure.storage.blob import ContainerClient

@ -19,14 +18,13 @@ class RAGFlowAzureSasBlob(object):
try:
if self.conn:
self.__close__()
except Exception as e:
except Exception:
pass

try:
self.conn = ContainerClient.from_container_url(self.container_url + "?" + self.sas_token)
except Exception as e:
azure_logger.error(
"Fail to connect %s " % self.container_url + str(e))
except Exception:
logger.exception("Fail to connect %s " % self.container_url)

def __close__(self):
del self.conn
@ -40,24 +38,24 @@ class RAGFlowAzureSasBlob(object):
for _ in range(3):
try:
return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))
except Exception as e:
azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)

def rm(self, bucket, fnm):
try:
self.conn.delete_blob(fnm)
except Exception as e:
azure_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")

def get(self, bucket, fnm):
for _ in range(1):
try:
r = self.conn.download_blob(fnm)
return r.read()
except Exception as e:
azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -65,16 +63,16 @@ class RAGFlowAzureSasBlob(object):
def obj_exist(self, bucket, fnm):
try:
return self.conn.get_blob_client(fnm).exists()
except Exception as e:
azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
return False

def get_presigned_url(self, bucket, fnm, expires):
for _ in range(10):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception as e:
azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
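In the storage connectors in this part of the diff, `except Exception as e:` blocks that concatenated `str(e)` into an error message (sometimes alongside `traceback.print_exc()`) become bare `except Exception:` blocks that call `logger.exception(...)`. A small standard-library illustration of why the shorter form loses no information: `Logger.exception()` logs at ERROR level and appends the active traceback automatically. The `fetch()` helper and blob name below are made up for the example.

```python
# Standalone illustration of the logging pattern adopted in this diff.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

def fetch(blob: str) -> bytes:
    raise FileNotFoundError(blob)

try:
    fetch("kb/doc.pdf")
except Exception:
    # Emits the message at ERROR level plus the full traceback, replacing the
    # older `logger.error("..." + str(e))` / `traceback.print_exc()` pair.
    logger.exception("Fail get %s", "kb/doc.pdf")
```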
|
@ -1,7 +1,6 @@
import os
import time
from rag import settings
from rag.settings import azure_logger
from rag.utils import singleton
from azure.identity import ClientSecretCredential, AzureAuthorityHosts
from azure.storage.filedatalake import FileSystemClient
@ -22,15 +21,14 @@ class RAGFlowAzureSpnBlob(object):
try:
if self.conn:
self.__close__()
except Exception as e:
except Exception:
pass

try:
credentials = ClientSecretCredential(tenant_id=self.tenant_id, client_id=self.client_id, client_secret=self.secret, authority=AzureAuthorityHosts.AZURE_CHINA)
self.conn = FileSystemClient(account_url=self.account_url, file_system_name=self.container_name, credential=credentials)
except Exception as e:
azure_logger.error(
"Fail to connect %s " % self.account_url + str(e))
except Exception:
logger.exception("Fail to connect %s" % self.account_url)

def __close__(self):
del self.conn
@ -48,16 +46,16 @@ class RAGFlowAzureSpnBlob(object):
f = self.conn.create_file(fnm)
f.append_data(binary, offset=0, length=len(binary))
return f.flush_data(len(binary))
except Exception as e:
azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)

def rm(self, bucket, fnm):
try:
self.conn.delete_file(fnm)
except Exception as e:
azure_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")

def get(self, bucket, fnm):
for _ in range(1):
@ -65,8 +63,8 @@ class RAGFlowAzureSpnBlob(object):
client = self.conn.get_file_client(fnm)
r = client.download_file()
return r.read()
except Exception as e:
azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -75,16 +73,16 @@ class RAGFlowAzureSpnBlob(object):
try:
client = self.conn.get_file_client(fnm)
return client.exists()
except Exception as e:
azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
return False

def get_presigned_url(self, bucket, fnm, expires):
for _ in range(10):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception as e:
azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -9,7 +9,7 @@ import copy
from elasticsearch import Elasticsearch
from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
from elastic_transport import ConnectionTimeout
from rag.settings import doc_store_logger
from api.utils.log_utils import logger
from rag import settings
from rag.utils import singleton
from api.utils.file_utils import get_project_base_directory
@ -17,7 +17,7 @@ import polars as pl
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, FusionExpr
from rag.nlp import is_english, rag_tokenizer

doc_store_logger.info("Elasticsearch sdk version: "+str(elasticsearch.__version__))
logger.info("Elasticsearch sdk version: "+str(elasticsearch.__version__))

@singleton
@ -34,10 +34,10 @@ class ESConnection(DocStoreConnection):
)
if self.es:
self.info = self.es.info()
doc_store_logger.info("Connect to es.")
logger.info("Connect to es.")
break
except Exception as e:
doc_store_logger.error("Fail to connect to es: " + str(e))
except Exception:
logger.exception("Fail to connect to es")
time.sleep(1)
if not self.es.ping():
raise Exception("Can't connect to ES cluster")
@ -70,14 +70,14 @@ class ESConnection(DocStoreConnection):
return IndicesClient(self.es).create(index=indexName,
settings=self.mapping["settings"],
mappings=self.mapping["mappings"])
except Exception as e:
doc_store_logger.error("ES create index error %s ----%s" % (indexName, str(e)))
except Exception:
logger.exception("ES create index error %s" % (indexName))

def deleteIdx(self, indexName: str, knowledgebaseId: str):
try:
return self.es.indices.delete(indexName, allow_no_indices=True)
except Exception as e:
doc_store_logger.error("ES delete index error %s ----%s" % (indexName, str(e)))
except Exception:
logger.exception("ES delete index error %s" % (indexName))

def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
s = Index(indexName, self.es)
@ -85,7 +85,7 @@ class ESConnection(DocStoreConnection):
try:
return s.exists()
except Exception as e:
doc_store_logger.error("ES indexExist: " + str(e))
logger.exception("ES indexExist")
if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
continue
return False
@ -159,7 +159,7 @@ class ESConnection(DocStoreConnection):
if limit > 0:
s = s[offset:limit]
q = s.to_dict()
doc_store_logger.info("ESConnection.search [Q]: " + json.dumps(q))
# logger.info("ESConnection.search [Q]: " + json.dumps(q))

for i in range(3):
try:
@ -171,18 +171,14 @@ class ESConnection(DocStoreConnection):
_source=True)
if str(res.get("timed_out", "")).lower() == "true":
raise Exception("Es Timeout.")
doc_store_logger.info("ESConnection.search res: " + str(res))
logger.info("ESConnection.search res: " + str(res))
return res
except Exception as e:
doc_store_logger.error(
"ES search exception: " +
str(e) +
"\n[Q]: " +
str(q))
logger.exception("ES search [Q]: " + str(q))
if str(e).find("Timeout") > 0:
continue
raise e
doc_store_logger.error("ES search timeout for 3 times!")
logger.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")

def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
@ -198,15 +194,11 @@ class ESConnection(DocStoreConnection):
chunk["id"] = chunkId
return chunk
except Exception as e:
doc_store_logger.error(
"ES get exception: " +
str(e) +
"[Q]: " +
chunkId)
logger.exception(f"ES get({chunkId}) got exception")
if str(e).find("Timeout") > 0:
continue
raise e
doc_store_logger.error("ES search timeout for 3 times!")
logger.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")

def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str) -> list[str]:
@ -236,7 +228,7 @@ class ESConnection(DocStoreConnection):
res.append(str(item[action]["_id"]) + ":" + str(item[action]["error"]))
return res
except Exception as e:
doc_store_logger.warning("Fail to bulk: " + str(e))
logger.warning("Fail to bulk: " + str(e))
if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
time.sleep(3)
continue
@ -253,9 +245,7 @@ class ESConnection(DocStoreConnection):
self.es.update(index=indexName, id=chunkId, doc=doc)
return True
except Exception as e:
doc_store_logger.error(
"ES update exception: " + str(e) + " id:" + str(id) +
json.dumps(newValue, ensure_ascii=False))
logger.exception(f"ES failed to update(index={indexName}, id={id}, doc={json.dumps(condition, ensure_ascii=False)})")
if str(e).find("Timeout") > 0:
continue
else:
@ -292,8 +282,7 @@ class ESConnection(DocStoreConnection):
_ = ubq.execute()
return True
except Exception as e:
doc_store_logger.error("ES update exception: " +
str(e) + "[Q]:" + str(bqry.to_dict()))
logger.error("ES update exception: " + str(e) + "[Q]:" + str(bqry.to_dict()))
if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
continue
return False
@ -315,7 +304,7 @@ class ESConnection(DocStoreConnection):
qry.must.append(Q("term", **{k: v}))
else:
raise Exception("Condition value must be int, str or list.")
doc_store_logger.info("ESConnection.delete [Q]: " + json.dumps(qry.to_dict()))
logger.info("ESConnection.delete [Q]: " + json.dumps(qry.to_dict()))
for _ in range(10):
try:
res = self.es.delete_by_query(
@ -324,7 +313,7 @@ class ESConnection(DocStoreConnection):
refresh=True)
return res["deleted"]
except Exception as e:
doc_store_logger.warning("Fail to delete: " + str(filter) + str(e))
logger.warning("Fail to delete: " + str(filter) + str(e))
if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
time.sleep(3)
continue
@ -407,7 +396,7 @@ class ESConnection(DocStoreConnection):
SQL
"""
def sql(self, sql: str, fetch_size: int, format: str):
doc_store_logger.info(f"ESConnection.sql get sql: {sql}")
logger.info(f"ESConnection.sql get sql: {sql}")
sql = re.sub(r"[ `]+", " ", sql)
sql = sql.replace("%", "")
replaces = []
@ -424,17 +413,17 @@ class ESConnection(DocStoreConnection):

for p, r in replaces:
sql = sql.replace(p, r, 1)
doc_store_logger.info(f"ESConnection.sql to es: {sql}")
logger.info(f"ESConnection.sql to es: {sql}")

for i in range(3):
try:
res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, request_timeout="2s")
return res
except ConnectionTimeout:
doc_store_logger.error("ESConnection.sql timeout [Q]: " + sql)
logger.exception("ESConnection.sql timeout [Q]: " + sql)
continue
except Exception as e:
doc_store_logger.error(f"ESConnection.sql failure: {sql} => " + str(e))
except Exception:
logger.exception("ESConnection.sql got exception [Q]: " + sql)
return None
doc_store_logger.error("ESConnection.sql timeout for 3 times!")
logger.error("ESConnection.sql timeout for 3 times!")
return None

@ -7,7 +7,7 @@ from infinity.common import ConflictType, InfinityException
from infinity.index import IndexInfo, IndexType
from infinity.connection_pool import ConnectionPool
from rag import settings
from rag.settings import doc_store_logger
from api.utils.log_utils import logger
from rag.utils import singleton
import polars as pl
from polars.series.series import Series
@ -22,7 +22,6 @@ from rag.utils.doc_store_conn import (
OrderByExpr,
)

def equivalent_condition_to_str(condition: dict) -> str:
assert "_id" not in condition
cond = list()
@ -56,7 +55,7 @@ class InfinityConnection(DocStoreConnection):
host, port = infinity_uri.split(":")
infinity_uri = infinity.common.NetworkAddress(host, int(port))
self.connPool = ConnectionPool(infinity_uri)
doc_store_logger.info(f"Connected to infinity {infinity_uri}.")
logger.info(f"Connected to infinity {infinity_uri}.")

"""
Database operations
@ -71,7 +70,7 @@ class InfinityConnection(DocStoreConnection):
TODO: Infinity-sdk provides health() to wrap `show global variables` and `show tables`
"""
inf_conn = self.connPool.get_conn()
res = infinity.show_current_node()
res = inf_conn.show_current_node()
self.connPool.release_conn(inf_conn)
color = "green" if res.error_code == 0 else "red"
res2 = {
@ -132,7 +131,7 @@ class InfinityConnection(DocStoreConnection):
)
break
self.connPool.release_conn(inf_conn)
doc_store_logger.info(
logger.info(
f"INFINITY created table {table_name}, vector size {vectorSize}"
)

@ -142,7 +141,7 @@ class InfinityConnection(DocStoreConnection):
db_instance = inf_conn.get_database(self.dbName)
db_instance.drop_table(table_name, ConflictType.Ignore)
self.connPool.release_conn(inf_conn)
doc_store_logger.info(f"INFINITY dropped table {table_name}")
logger.info(f"INFINITY dropped table {table_name}")

def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
table_name = f"{indexName}_{knowledgebaseId}"
@ -152,8 +151,8 @@ class InfinityConnection(DocStoreConnection):
_ = db_instance.get_table(table_name)
self.connPool.release_conn(inf_conn)
return True
except Exception as e:
doc_store_logger.error("INFINITY indexExist: " + str(e))
except Exception:
logger.exception("INFINITY indexExist")
return False

"""
@ -263,7 +262,7 @@ class InfinityConnection(DocStoreConnection):
df_list.append(kb_res)
self.connPool.release_conn(inf_conn)
res = pl.concat(df_list)
doc_store_logger.info("INFINITY search tables: " + str(table_list))
logger.info("INFINITY search tables: " + str(table_list))
return res

def get(
@ -318,8 +317,8 @@ class InfinityConnection(DocStoreConnection):
str_filter = f"id IN ({str_ids})"
table_instance.delete(str_filter)
# for doc in documents:
# doc_store_logger.info(f"insert position_list: {doc['position_list']}")
# doc_store_logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
# logger.info(f"insert position_list: {doc['position_list']}")
# logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
table_instance.insert(documents)
self.connPool.release_conn(inf_conn)
doc_store_logger.info(f"inserted into {table_name} {str_ids}.")
@ -329,7 +328,7 @@ class InfinityConnection(DocStoreConnection):
self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
) -> bool:
# if 'position_list' in newValue:
# doc_store_logger.info(f"update position_list: {newValue['position_list']}")
# logger.info(f"upsert position_list: {newValue['position_list']}")
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
table_name = f"{indexName}_{knowledgebaseId}"
@ -350,7 +349,7 @@ class InfinityConnection(DocStoreConnection):
try:
table_instance = db_instance.get_table(table_name)
except Exception:
doc_store_logger.warning(
logger.warning(
f"Skipped deleting `{filter}` from table {table_name} since the table doesn't exist."
)
return 0
|
@ -1,10 +1,9 @@
import os
import time
from minio import Minio
from io import BytesIO
from rag import settings
from rag.settings import minio_logger
from rag.utils import singleton
from api.utils.log_utils import logger

@singleton
@ -17,7 +16,7 @@ class RAGFlowMinio(object):
try:
if self.conn:
self.__close__()
except Exception as e:
except Exception:
pass

try:
@ -26,9 +25,9 @@ class RAGFlowMinio(object):
secret_key=settings.MINIO["password"],
secure=False
)
except Exception as e:
minio_logger.error(
"Fail to connect %s " % settings.MINIO["host"] + str(e))
except Exception:
logger.exception(
"Fail to connect %s " % settings.MINIO["host"])

def __close__(self):
del self.conn
@ -55,24 +54,24 @@ class RAGFlowMinio(object):
len(binary)
)
return r
except Exception as e:
minio_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)

def rm(self, bucket, fnm):
try:
self.conn.remove_object(bucket, fnm)
except Exception as e:
minio_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")

def get(self, bucket, fnm):
for _ in range(1):
try:
r = self.conn.get_object(bucket, fnm)
return r.read()
except Exception as e:
minio_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
return
@ -81,8 +80,8 @@ class RAGFlowMinio(object):
try:
if self.conn.stat_object(bucket, fnm):return True
return False
except Exception as e:
minio_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
return False

@ -90,8 +89,8 @@ class RAGFlowMinio(object):
for _ in range(10):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception as e:
minio_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
return
|
@ -110,9 +110,8 @@ class RedisDB:
#pipeline.expire(queue, exp)
pipeline.execute()
return True
except Exception as e:
print(e)
logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e))
except Exception:
logging.exception("producer" + str(queue) + " got exception")
return False

def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload:
@ -143,7 +142,7 @@ class RedisDB:
if 'key' in str(e):
pass
else:
logging.warning("[EXCEPTION]consumer: " + str(queue_name) + "||" + str(e))
logging.exception("consumer: " + str(queue_name) + " got exception")
return None

def get_unacked_for(self, consumer_name, queue_name, group_name):
@ -160,7 +159,7 @@ class RedisDB:
except Exception as e:
if 'key' in str(e):
return
logging.warning("[EXCEPTION]xpending_range: " + consumer_name + "||" + str(e))
logging.exception("xpending_range: " + consumer_name + " got exception")
self.__open__()

REDIS_CONN = RedisDB()

@ -4,7 +4,6 @@ from botocore.exceptions import ClientError
from botocore.client import Config
import time
from io import BytesIO
from rag.settings import s3_logger
from rag.utils import singleton

@singleton
@ -21,7 +20,7 @@ class RAGFlowS3(object):
try:
if self.conn:
self.__close__()
except Exception as e:
except Exception:
pass

try:
@ -40,9 +39,9 @@ class RAGFlowS3(object):
aws_secret_access_key=self.secret_key,
config=config
)
except Exception as e:
s3_logger.error(
"Fail to connect %s " % self.endpoint + str(e))
except Exception:
logger.exception(
"Fail to connect %s" % self.endpoint)

def __close__(self):
del self.conn
@ -50,11 +49,11 @@ class RAGFlowS3(object):

def bucket_exists(self, bucket):
try:
s3_logger.error(f"head_bucket bucketname {bucket}")
logger.debug(f"head_bucket bucketname {bucket}")
self.conn.head_bucket(Bucket=bucket)
exists = True
except ClientError as e:
s3_logger.error(f"head_bucket error {bucket}: " + str(e))
except ClientError:
logger.exception(f"head_bucket error {bucket}")
exists = False
return exists

@ -63,7 +62,7 @@ class RAGFlowS3(object):

if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
s3_logger.error(f"create bucket {bucket} ********")
logger.debug(f"create bucket {bucket} ********")

r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
return r
@ -75,25 +74,25 @@ class RAGFlowS3(object):
return []

def put(self, bucket, fnm, binary):
s3_logger.error(f"bucket name {bucket}; filename :{fnm}:")
logger.debug(f"bucket name {bucket}; filename :{fnm}:")
for _ in range(1):
try:
if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
s3_logger.error(f"create bucket {bucket} ********")
logger.info(f"create bucket {bucket} ********")
r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)

return r
except Exception as e:
s3_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)

def rm(self, bucket, fnm):
try:
self.conn.delete_object(Bucket=bucket, Key=fnm)
except Exception as e:
s3_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")

def get(self, bucket, fnm):
for _ in range(1):
@ -101,8 +100,8 @@ class RAGFlowS3(object):
r = self.conn.get_object(Bucket=bucket, Key=fnm)
object_data = r['Body'].read()
return object_data
except Exception as e:
s3_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -128,8 +127,8 @@ class RAGFlowS3(object):
ExpiresIn=expires)

return r
except Exception as e:
s3_logger.error(f"fail get url {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get url {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
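Beyond replacing the scattered `*_logger` objects, the task_executor `__main__` hunk earlier in this diff shows the other half of the consolidation: third-party loggers are pointed at the shared handler. A self-contained sketch of that wiring, where the logger name and handler setup are stand-ins for whatever `api.utils.log_utils` actually configures:

```python
# Sketch of routing a third-party logger (peewee, as in the __main__ hunk) into the
# same handler as the unified application logger. The handler here is illustrative.
import logging

logger = logging.getLogger("ragflow")                  # stand-in for the unified logger
logger.addHandler(logging.FileHandler("ragflow.log"))  # stand-in for the shared file handler

peewee_logger = logging.getLogger("peewee")
peewee_logger.propagate = False                        # avoid duplicate lines via the root logger
peewee_logger.addHandler(logger.handlers[0])           # reuse the shared handler
peewee_logger.setLevel(logger.handlers[0].level)       # match the handler's threshold
```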