diff --git a/api/apps/api_app.py b/api/apps/api_app.py
index 05ff44bb4..0086444a2 100644
--- a/api/apps/api_app.py
+++ b/api/apps/api_app.py
@@ -33,7 +33,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
 from itsdangerous import URLSafeTimedSerializer
 
 from api.utils.file_utils import filename_type, thumbnail
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 
 
 def generate_confirmation_token(tenent_id):
diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index 81f5b5285..2929e8892 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -21,7 +21,8 @@ from elasticsearch_dsl import Q
 
 from rag.app.qa import rmPrefix, beAdoc
 from rag.nlp import search, huqie
-from rag.utils import ELASTICSEARCH, rmSpace
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils import rmSpace
 from api.db import LLMType, ParserType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 2f783da52..d5551d747 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -27,7 +27,7 @@ from flask_login import login_required, current_user
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
diff --git a/api/apps/file2document_app.py b/api/apps/file2document_app.py
index 7902ad23e..d3861b59b 100644
--- a/api/apps/file2document_app.py
+++ b/api/apps/file2document_app.py
@@ -29,7 +29,7 @@ from api.db.services.document_service import DocumentService
 from api.settings import RetCode
 from api.utils.api_utils import get_json_result
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 
 
 @manager.route('/convert', methods=['POST'])
diff --git a/api/apps/file_app.py b/api/apps/file_app.py
index 17944a9f9..93fd3fdb3 100644
--- a/api/apps/file_app.py
+++ b/api/apps/file_app.py
@@ -33,7 +33,7 @@ from api.settings import RetCode
 from api.utils.api_utils import get_json_result
 from api.utils.file_utils import filename_type
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 
 
diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index ddfa0d2e8..848bf2f15 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -28,7 +28,7 @@ from api.db.db_models import Knowledgebase
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 
 
 @manager.route('/create', methods=['post'])
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index c4ddf98dd..455a64e6e 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -16,7 +16,7 @@ from peewee import Expression
 from elasticsearch_dsl import Q
 
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 from rag.nlp import search
 
 
diff --git a/api/settings.py b/api/settings.py
index 6010ab190..4d284325b 100644
--- a/api/settings.py
+++ b/api/settings.py
@@ -32,7 +32,7 @@ access_logger = getLogger("access")
 database_logger = getLogger("database")
 chat_logger = getLogger("chat")
 
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.nlp import search
 from api.utils import get_base_config, decrypt_database_config
 
diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py
index 30353b31f..b9de7eab7 100644
--- a/deepdoc/parser/__init__.py
+++ b/deepdoc/parser/__init__.py
@@ -1,6 +1,6 @@
-from .pdf_parser import HuParser as PdfParser, PlainParser
-from .docx_parser import HuDocxParser as DocxParser
-from .excel_parser import HuExcelParser as ExcelParser
-from .ppt_parser import HuPptParser as PptParser
+from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
+from .docx_parser import RAGFlowDocxParser as DocxParser
+from .excel_parser import RAGFlowExcelParser as ExcelParser
+from .ppt_parser import RAGFlowPptParser as PptParser
 
 
diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py
index 10a84d5b5..e45b5d51a 100644
--- a/deepdoc/parser/docx_parser.py
+++ b/deepdoc/parser/docx_parser.py
@@ -7,7 +7,7 @@ from rag.nlp import huqie
 from io import BytesIO
 
 
-class HuDocxParser:
+class RAGFlowDocxParser:
 
     def __extract_table_content(self, tb):
         df = []
diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py
index b8cbb1665..2c3e67757 100644
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -6,7 +6,7 @@ from io import BytesIO
 from rag.nlp import find_codec
 
 
-class HuExcelParser:
+class RAGFlowExcelParser:
     def html(self, fnm):
         if isinstance(fnm, str):
             wb = load_workbook(fnm)
@@ -74,5 +74,5 @@ class HuExcelParser:
 
 
 if __name__ == "__main__":
-    psr = HuExcelParser()
+    psr = RAGFlowExcelParser()
     psr(sys.argv[1])
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 96f4bdd28..67b9d172c 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -23,7 +23,7 @@ from huggingface_hub import snapshot_download
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 
 
-class HuParser:
+class RAGFlowPdfParser:
     def __init__(self):
         self.ocr = OCR()
         if hasattr(self, "model_speciess"):
diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
index 7266112d7..9b67336d0 100644
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -14,7 +14,7 @@ from io import BytesIO
 from pptx import Presentation
 
 
-class HuPptParser(object):
+class RAGFlowPptParser(object):
     def __init__(self):
         super().__init__()
 
diff --git a/deepdoc/vision/t_ocr.py b/deepdoc/vision/t_ocr.py
index d30f3c2bc..37a87fd3f 100644
--- a/deepdoc/vision/t_ocr.py
+++ b/deepdoc/vision/t_ocr.py
@@ -11,10 +11,6 @@
 #  limitations under the License.
 #
 
-from deepdoc.vision.seeit import draw_box
-from deepdoc.vision import OCR, init_in_out
-import argparse
-import numpy as np
 import os
 import sys
 sys.path.insert(
@@ -25,6 +21,11 @@ sys.path.insert(
             os.path.abspath(__file__)),
         '../../')))
 
+from deepdoc.vision.seeit import draw_box
+from deepdoc.vision import OCR, init_in_out
+import argparse
+import numpy as np
+
 
 def main(args):
     ocr = OCR()
diff --git a/deepdoc/vision/t_recognizer.py b/deepdoc/vision/t_recognizer.py
index a04afa413..5a1c8592d 100644
--- a/deepdoc/vision/t_recognizer.py
+++ b/deepdoc/vision/t_recognizer.py
@@ -10,17 +10,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-
-from deepdoc.vision.seeit import draw_box
-from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
-from api.utils.file_utils import get_project_base_directory
-import argparse
-import os
-import sys
-import re
-
-import numpy as np
-
+import os, sys
 sys.path.insert(
     0,
     os.path.abspath(
@@ -29,6 +19,13 @@ sys.path.insert(
             os.path.abspath(__file__)),
         '../../')))
 
+from deepdoc.vision.seeit import draw_box
+from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
+from api.utils.file_utils import get_project_base_directory
+import argparse
+import re
+import numpy as np
+
 
 def main(args):
     images, outputs = init_in_out(args)
diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py
index 3b035a435..14d789a5a 100644
--- a/rag/llm/__init__.py
+++ b/rag/llm/__init__.py
@@ -22,7 +22,7 @@ EmbeddingModel = {
     "Ollama": OllamaEmbed,
     "OpenAI": OpenAIEmbed,
     "Xinference": XinferenceEmbed,
-    "Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
+    "Tongyi-Qianwen": DefaultEmbedding, #QWenEmbed,
     "ZHIPU-AI": ZhipuEmbed,
     "FastEmbed": FastEmbed,
     "Youdao": YoudaoEmbed
diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py
index 597dbfdc9..ea6436cf2 100644
--- a/rag/llm/embedding_model.py
+++ b/rag/llm/embedding_model.py
@@ -56,7 +56,7 @@ class Base(ABC):
         raise NotImplementedError("Please implement encode method!")
 
 
-class HuEmbedding(Base):
+class DefaultEmbedding(Base):
     def __init__(self, *args, **kwargs):
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!
diff --git a/rag/nlp/huchunk.py b/rag/nlp/huchunk.py
deleted file mode 100644
index 8c4c6fc91..000000000
--- a/rag/nlp/huchunk.py
+++ /dev/null
@@ -1,475 +0,0 @@
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import re
-import os
-import copy
-import base64
-import magic
-from dataclasses import dataclass
-from typing import List
-import numpy as np
-from io import BytesIO
-
-
-class HuChunker:
-
-    @dataclass
-    class Fields:
-        text_chunks: List = None
-        table_chunks: List = None
-
-    def __init__(self):
-        self.MAX_LVL = 12
-        self.proj_patt = [
-            (r"第[零一二三四五六七八九十百]+章", 1),
-            (r"第[零一二三四五六七八九十百]+[条节]", 2),
-            (r"[零一二三四五六七八九十百]+[、  ]", 3),
-            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
-            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
-            (r"[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 6),
-            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
-            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
-            (r".{,48}[::??]@", 9),
-            (r"[0-9]+)", 10),
-            (r"[\((][0-9]+[)\)]", 11),
-            (r"[零一二三四五六七八九十百]+是", 12),
-            (r"[⚫•➢✓ ]", 12)
-        ]
-        self.lines = []
-
-    def _garbage(self, txt):
-        patt = [
-            r"(在此保证|不得以任何形式翻版|请勿传阅|仅供内部使用|未经事先书面授权)",
-            r"(版权(归本公司)*所有|免责声明|保留一切权力|承担全部责任|特别声明|报告中涉及)",
-            r"(不承担任何责任|投资者的通知事项:|任何机构和个人|本报告仅为|不构成投资)",
-            r"(不构成对任何个人或机构投资建议|联系其所在国家|本报告由从事证券交易)",
-            r"(本研究报告由|「认可投资者」|所有研究报告均以|请发邮件至)",
-            r"(本报告仅供|市场有风险,投资需谨慎|本报告中提及的)",
-            r"(本报告反映|此信息仅供|证券分析师承诺|具备证券投资咨询业务资格)",
-            r"^(时间|签字|签章)[::]",
-            r"(参考文献|目录索引|图表索引)",
-            r"[ ]*年[ ]+月[ ]+日",
-            r"^(中国证券业协会|[0-9]+年[0-9]+月[0-9]+日)$",
-            r"\.{10,}",
-            r"(———————END|帮我转发|欢迎收藏|快来关注我吧)"
-        ]
-        return any([re.search(p, txt) for p in patt])
-
-    def _proj_match(self, line):
-        for p, j in self.proj_patt:
-            if re.match(p, line):
-                return j
-        return
-
-    def _does_proj_match(self):
-        mat = [None for _ in range(len(self.lines))]
-        for i in range(len(self.lines)):
-            mat[i] = self._proj_match(self.lines[i])
-        return mat
-
-    def naive_text_chunk(self, text, ti="", MAX_LEN=612):
-        if text:
-            self.lines = [l.strip().replace(u'\u3000', u' ')
-                          .replace(u'\xa0', u'')
-                          for l in text.split("\n\n")]
-            self.lines = [l for l in self.lines if not self._garbage(l)]
-            self.lines = [re.sub(r"([ ]+|&nbsp;)", " ", l)
-                          for l in self.lines if l]
-            if not self.lines:
-                return []
-        arr = self.lines
-
-        res = [""]
-        i = 0
-        while i < len(arr):
-            a = arr[i]
-            if not a:
-                i += 1
-                continue
-            if len(a) > MAX_LEN:
-                a_ = a.split("\n")
-                if len(a_) >= 2:
-                    arr.pop(i)
-                    for j in range(2, len(a_) + 1):
-                        if len("\n".join(a_[:j])) >= MAX_LEN:
-                            arr.insert(i, "\n".join(a_[:j - 1]))
-                            arr.insert(i + 1, "\n".join(a_[j - 1:]))
-                            break
-                    else:
-                        assert False, f"Can't split: {a}"
-                    continue
-
-            if len(res[-1]) < MAX_LEN / 3:
-                res[-1] += "\n" + a
-            else:
-                res.append(a)
-            i += 1
-
-        if ti:
-            for i in range(len(res)):
-                if res[i].find("——来自") >= 0:
-                    continue
-                res[i] += f"\t——来自“{ti}”"
-
-        return res
-
-    def _merge(self):
-        # merge continuous same level text
-        lines = [self.lines[0]] if self.lines else []
-        for i in range(1, len(self.lines)):
-            if self.mat[i] == self.mat[i - 1] \
-                    and len(lines[-1]) < 256 \
-                    and len(self.lines[i]) < 256:
-                lines[-1] += "\n" + self.lines[i]
-                continue
-            lines.append(self.lines[i])
-        self.lines = lines
-        self.mat = self._does_proj_match()
-        return self.mat
-
-    def text_chunks(self, text):
-        if text:
-            self.lines = [l.strip().replace(u'\u3000', u' ')
-                          .replace(u'\xa0', u'')
-                          for l in re.split(r"[\r\n]", text)]
-            self.lines = [l for l in self.lines if not self._garbage(l)]
-            self.lines = [l for l in self.lines if l]
-        self.mat = self._does_proj_match()
-        mat = self._merge()
-
-        tree = []
-        for i in range(len(self.lines)):
-            tree.append({"proj": mat[i],
-                         "children": [],
-                         "read": False})
-        # find all children
-        for i in range(len(self.lines) - 1):
-            if tree[i]["proj"] is None:
-                continue
-            ed = i + 1
-            while ed < len(tree) and (tree[ed]["proj"] is None or
-                                      tree[ed]["proj"] > tree[i]["proj"]):
-                ed += 1
-
-            nxt = tree[i]["proj"] + 1
-            st = set([p["proj"] for p in tree[i + 1: ed] if p["proj"]])
-            while nxt not in st:
-                nxt += 1
-                if nxt > self.MAX_LVL:
-                    break
-            if nxt <= self.MAX_LVL:
-                for j in range(i + 1, ed):
-                    if tree[j]["proj"] is not None:
-                        break
-                    tree[i]["children"].append(j)
-                for j in range(i + 1, ed):
-                    if tree[j]["proj"] != nxt:
-                        continue
-                    tree[i]["children"].append(j)
-            else:
-                for j in range(i + 1, ed):
-                    tree[i]["children"].append(j)
-
-        # get DFS combinations, find all the paths to leaf
-        paths = []
-
-        def dfs(i, path):
-            nonlocal tree, paths
-            path.append(i)
-            tree[i]["read"] = True
-            if len(self.lines[i]) > 256:
-                paths.append(path)
-                return
-            if not tree[i]["children"]:
-                if len(path) > 1 or len(self.lines[i]) >= 32:
-                    paths.append(path)
-                return
-            for j in tree[i]["children"]:
-                dfs(j, copy.deepcopy(path))
-
-        for i, t in enumerate(tree):
-            if t["read"]:
-                continue
-            dfs(i, [])
-
-        # concat txt on the path for all paths
-        res = []
-        lines = np.array(self.lines)
-        for p in paths:
-            if len(p) < 2:
-                tree[p[0]]["read"] = False
-                continue
-            txt = "\n".join(lines[p[:-1]]) + "\n" + lines[p[-1]]
-            res.append(txt)
-        # concat continuous orphans
-        assert len(tree) == len(lines)
-        ii = 0
-        while ii < len(tree):
-            if tree[ii]["read"]:
-                ii += 1
-                continue
-            txt = lines[ii]
-            e = ii + 1
-            while e < len(tree) and not tree[e]["read"] and len(txt) < 256:
-                txt += "\n" + lines[e]
-                e += 1
-            res.append(txt)
-            ii = e
-
-        # if the node has not been read, find its daddy
-        def find_daddy(st):
-            nonlocal lines, tree
-            proj = tree[st]["proj"]
-            if len(self.lines[st]) > 512:
-                return [st]
-            if proj is None:
-                proj = self.MAX_LVL + 1
-            for i in range(st - 1, -1, -1):
-                if tree[i]["proj"] and tree[i]["proj"] < proj:
-                    a = [st] + find_daddy(i)
-                    return a
-            return []
-
-        return res
-
-
-class PdfChunker(HuChunker):
-
-    def __init__(self, pdf_parser):
-        self.pdf = pdf_parser
-        super().__init__()
-
-    def tableHtmls(self, pdfnm):
-        _, tbls = self.pdf(pdfnm, return_html=True)
-        res = []
-        for img, arr in tbls:
-            if arr[0].find("<table>") < 0:
-                continue
-            buffered = BytesIO()
-            if img:
-                img.save(buffered, format="JPEG")
-            img_str = base64.b64encode(
-                buffered.getvalue()).decode('utf-8') if img else ""
-            res.append({"table": arr[0], "image": img_str})
-        return res
-
-    def html(self, pdfnm):
-        txts, tbls = self.pdf(pdfnm, return_html=True)
-        res = []
-        txt_cks = self.text_chunks(txts)
-        for txt, img in [(self.pdf.remove_tag(c), self.pdf.crop(c))
-                         for c in txt_cks]:
-            buffered = BytesIO()
-            if img:
-                img.save(buffered, format="JPEG")
-            img_str = base64.b64encode(
-                buffered.getvalue()).decode('utf-8') if img else ""
-            res.append({"table": "<p>%s</p>" % txt.replace("\n", "<br/>"),
-                        "image": img_str})
-
-        for img, arr in tbls:
-            if not arr:
-                continue
-            buffered = BytesIO()
-            if img:
-                img.save(buffered, format="JPEG")
-            img_str = base64.b64encode(
-                buffered.getvalue()).decode('utf-8') if img else ""
-            res.append({"table": arr[0], "image": img_str})
-
-        return res
-
-    def __call__(self, pdfnm, return_image=True, naive_chunk=False):
-        flds = self.Fields()
-        text, tbls = self.pdf(pdfnm)
-        fnm = pdfnm
-        txt_cks = self.text_chunks(text) if not naive_chunk else \
-            self.naive_text_chunk(text, ti=fnm if isinstance(fnm, str) else "")
-        flds.text_chunks = [(self.pdf.remove_tag(c),
-                             self.pdf.crop(c) if return_image else None) for c in txt_cks]
-
-        flds.table_chunks = [(arr, img if return_image else None)
-                             for img, arr in tbls]
-        return flds
-
-
-class DocxChunker(HuChunker):
-
-    def __init__(self, doc_parser):
-        self.doc = doc_parser
-        super().__init__()
-
-    def _does_proj_match(self):
-        mat = []
-        for s in self.styles:
-            s = s.split(" ")[-1]
-            try:
-                mat.append(int(s))
-            except Exception as e:
-                mat.append(None)
-        return mat
-
-    def _merge(self):
-        i = 1
-        while i < len(self.lines):
-            if self.mat[i] == self.mat[i - 1] \
-                    and len(self.lines[i - 1]) < 256 \
-                    and len(self.lines[i]) < 256:
-                self.lines[i - 1] += "\n" + self.lines[i]
-                self.styles.pop(i)
-                self.lines.pop(i)
-                self.mat.pop(i)
-                continue
-            i += 1
-        self.mat = self._does_proj_match()
-        return self.mat
-
-    def __call__(self, fnm):
-        flds = self.Fields()
-        flds.title = os.path.splitext(
-            os.path.basename(fnm))[0] if isinstance(
-            fnm, type("")) else ""
-        secs, tbls = self.doc(fnm)
-        self.lines = [l for l, s in secs]
-        self.styles = [s for l, s in secs]
-
-        txt_cks = self.text_chunks("")
-        flds.text_chunks = [(t, None) for t in txt_cks if not self._garbage(t)]
-        flds.table_chunks = [(tb, None) for tb in tbls for t in tb if t]
-        return flds
-
-
-class ExcelChunker(HuChunker):
-
-    def __init__(self, excel_parser):
-        self.excel = excel_parser
-        super().__init__()
-
-    def __call__(self, fnm):
-        flds = self.Fields()
-        flds.text_chunks = [(t, None) for t in self.excel(fnm)]
-        flds.table_chunks = []
-        return flds
-
-
-class PptChunker(HuChunker):
-
-    def __init__(self):
-        super().__init__()
-
-    def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(
-                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
-
-        if shape.has_text_frame:
-            return shape.text_frame.text
-
-        if shape.shape_type == 6:
-            texts = []
-            for p in shape.shapes:
-                t = self.__extract(p)
-                if t:
-                    texts.append(t)
-            return "\n".join(texts)
-
-    def __call__(self, fnm):
-        from pptx import Presentation
-        ppt = Presentation(fnm) if isinstance(
-            fnm, str) else Presentation(
-            BytesIO(fnm))
-        txts = []
-        for slide in ppt.slides:
-            texts = []
-            for shape in slide.shapes:
-                txt = self.__extract(shape)
-                if txt:
-                    texts.append(txt)
-            txts.append("\n".join(texts))
-
-        import aspose.slides as slides
-        import aspose.pydrawing as drawing
-        imgs = []
-        with slides.Presentation(BytesIO(fnm)) as presentation:
-            for slide in presentation.slides:
-                buffered = BytesIO()
-                slide.get_thumbnail(
-                    0.5, 0.5).save(
-                    buffered, drawing.imaging.ImageFormat.jpeg)
-                imgs.append(buffered.getvalue())
-        assert len(imgs) == len(
-            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
-
-        flds = self.Fields()
-        flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
-        flds.table_chunks = []
-
-        return flds
-
-
-class TextChunker(HuChunker):
-
-    @dataclass
-    class Fields:
-        text_chunks: List = None
-        table_chunks: List = None
-
-    def __init__(self):
-        super().__init__()
-
-    @staticmethod
-    def is_binary_file(file_path):
-        mime = magic.Magic(mime=True)
-        if isinstance(file_path, str):
-            file_type = mime.from_file(file_path)
-        else:
-            file_type = mime.from_buffer(file_path)
-        if 'text' in file_type:
-            return False
-        else:
-            return True
-
-    def __call__(self, fnm):
-        flds = self.Fields()
-        if self.is_binary_file(fnm):
-            return flds
-        txt = ""
-        if isinstance(fnm, str):
-            with open(fnm, "r") as f:
-                txt = f.read()
-        else:
-            txt = fnm.decode("utf-8")
-        flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
-        flds.table_chunks = []
-        return flds
-
-
-if __name__ == "__main__":
-    import sys
-    sys.path.append(os.path.dirname(__file__) + "/../")
-    if sys.argv[1].split(".")[-1].lower() == "pdf":
-        from deepdoc.parser import PdfParser
-        ckr = PdfChunker(PdfParser())
-    if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
-        from deepdoc.parser import DocxParser
-        ckr = DocxChunker(DocxParser())
-    if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
-        from deepdoc.parser import ExcelParser
-        ckr = ExcelChunker(ExcelParser())
-
-    # ckr.html(sys.argv[1])
-    print(ckr(sys.argv[1]))
diff --git a/rag/nlp/synonym.py b/rag/nlp/synonym.py
index c06f5766a..d6fccd325 100644
--- a/rag/nlp/synonym.py
+++ b/rag/nlp/synonym.py
@@ -17,12 +17,12 @@ class Dealer:
         try:
             self.dictionary = json.load(open(path, 'r'))
         except Exception as e:
-            logging.warning("Missing synonym.json")
+            logging.warn("Missing synonym.json")
             self.dictionary = {}
 
         if not redis:
             logging.warning(
-                "Real-time synonym is disabled, since no redis connection.")
+                "Realtime synonym is disabled, since no redis connection.")
 
         if not len(self.dictionary.keys()):
             logging.warning(f"Fail to load synonym")
diff --git a/rag/svr/cache_file_svr.py b/rag/svr/cache_file_svr.py
index 4eeaab629..f63c043de 100644
--- a/rag/svr/cache_file_svr.py
+++ b/rag/svr/cache_file_svr.py
@@ -4,7 +4,7 @@ import traceback
 
 from api.db.db_models import close_connection
 from api.db.services.task_service import TaskService
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 from rag.utils.redis_conn import REDIS_CONN
 
 
diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py
index d7b57d586..c479ca75c 100644
--- a/rag/svr/task_broker.py
+++ b/rag/svr/task_broker.py
@@ -24,9 +24,9 @@ from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.task_service import TaskService
 from deepdoc.parser import PdfParser
-from deepdoc.parser.excel_parser import HuExcelParser
+from deepdoc.parser.excel_parser import RAGFlowExcelParser
 from rag.settings import cron_logger
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 from rag.utils import findMaxTm
 import pandas as pd
 from api.db import FileType, TaskStatus
@@ -121,7 +121,7 @@ def dispatch():
                     tsks.append(task)
 
             elif r["parser_id"] == "table":
-                rn = HuExcelParser.row_number(
+                rn = RAGFlowExcelParser.row_number(
                     r["name"], file_bin)
                 for i in range(0, rn, 3000):
                     task = new_task()
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 032d9ea58..4e9086984 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -26,7 +26,7 @@ import traceback
 from functools import partial
 
 from api.db.services.file2document_service import File2DocumentService
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 from api.db.db_models import close_connection
 from rag.settings import database_logger
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
@@ -35,7 +35,7 @@ import numpy as np
 from elasticsearch_dsl import Q
 from multiprocessing.context import TimeoutError
 from api.db.services.task_service import TaskService
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from timeit import default_timer as timer
 from rag.utils import rmSpace, findMaxTm
 
diff --git a/rag/utils/__init__.py b/rag/utils/__init__.py
index f8d805aea..853611100 100644
--- a/rag/utils/__init__.py
+++ b/rag/utils/__init__.py
@@ -15,9 +15,6 @@ def singleton(cls, *args, **kw):
     return _singleton
 
 
-from .minio_conn import MINIO
-from .es_conn import ELASTICSEARCH
-
 def rmSpace(txt):
     txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
     return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
diff --git a/rag/utils/es_conn.py b/rag/utils/es_conn.py
index a3ceaf36d..87348c6aa 100644
--- a/rag/utils/es_conn.py
+++ b/rag/utils/es_conn.py
@@ -15,7 +15,7 @@ es_logger.info("Elasticsearch version: "+str(elasticsearch.__version__))
 
 
 @singleton
-class HuEs:
+class ESConnection:
     def __init__(self):
         self.info = {}
         self.conn()
@@ -454,4 +454,4 @@ class HuEs:
         scroll_size = len(page['hits']['hits'])
 
 
-ELASTICSEARCH = HuEs()
+ELASTICSEARCH = ESConnection()
diff --git a/rag/utils/minio_conn.py b/rag/utils/minio_conn.py
index a1a4e45a8..fbef33b5f 100644
--- a/rag/utils/minio_conn.py
+++ b/rag/utils/minio_conn.py
@@ -8,7 +8,7 @@ from rag.utils import singleton
 
 
 @singleton
-class HuMinio(object):
+class RAGFlowMinio(object):
     def __init__(self):
         self.conn = None
         self.__open__()
@@ -86,10 +86,12 @@ class HuMinio(object):
                 time.sleep(1)
         return
 
-MINIO = HuMinio()
+
+MINIO = RAGFlowMinio()
+
 
 if __name__ == "__main__":
-    conn = HuMinio()
+    conn = RAGFlowMinio()
     fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
     from PIL import Image
     img = Image.open(fnm)