diff --git a/api/apps/api_app.py b/api/apps/api_app.py
index 05ff44bb4..0086444a2 100644
--- a/api/apps/api_app.py
+++ b/api/apps/api_app.py
@@ -33,7 +33,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
 from itsdangerous import URLSafeTimedSerializer
 
 from api.utils.file_utils import filename_type, thumbnail
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 
 
 def generate_confirmation_token(tenent_id):
diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index 81f5b5285..2929e8892 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -21,7 +21,8 @@ from elasticsearch_dsl import Q
 
 from rag.app.qa import rmPrefix, beAdoc
 from rag.nlp import search, huqie
-from rag.utils import ELASTICSEARCH, rmSpace
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils import rmSpace
 from api.db import LLMType, ParserType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 2f783da52..d5551d747 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -27,7 +27,7 @@ from flask_login import login_required, current_user
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
diff --git a/api/apps/file2document_app.py b/api/apps/file2document_app.py
index 7902ad23e..d3861b59b 100644
--- a/api/apps/file2document_app.py
+++ b/api/apps/file2document_app.py
@@ -29,7 +29,7 @@ from api.db.services.document_service import DocumentService
 from api.settings import RetCode
 from api.utils.api_utils import get_json_result
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 
 
 @manager.route('/convert', methods=['POST'])
diff --git a/api/apps/file_app.py b/api/apps/file_app.py
index 17944a9f9..93fd3fdb3 100644
--- a/api/apps/file_app.py
+++ b/api/apps/file_app.py
@@ -33,7 +33,7 @@ from api.settings import RetCode
 from api.utils.api_utils import get_json_result
 from api.utils.file_utils import filename_type
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index ddfa0d2e8..848bf2f15 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -28,7 +28,7 @@ from api.db.db_models import Knowledgebase
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 
 
 @manager.route('/create', methods=['post'])
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index c4ddf98dd..455a64e6e 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -16,7 +16,7 @@ from peewee import Expression
 from elasticsearch_dsl import Q
 
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 from rag.nlp import search
diff --git a/api/settings.py b/api/settings.py
index 6010ab190..4d284325b 100644
--- a/api/settings.py
+++ b/api/settings.py
@@ -32,7 +32,7 @@ access_logger = getLogger("access")
 database_logger = getLogger("database")
 chat_logger = getLogger("chat")
 
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.nlp import search
 from api.utils import get_base_config, decrypt_database_config
diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py
index 30353b31f..b9de7eab7 100644
--- a/deepdoc/parser/__init__.py
+++ b/deepdoc/parser/__init__.py
@@ -1,6 +1,6 @@
-from .pdf_parser import HuParser as PdfParser, PlainParser
-from .docx_parser import HuDocxParser as DocxParser
-from .excel_parser import HuExcelParser as ExcelParser
-from .ppt_parser import HuPptParser as PptParser
+from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
+from .docx_parser import RAGFlowDocxParser as DocxParser
+from .excel_parser import RAGFlowExcelParser as ExcelParser
+from .ppt_parser import RAGFlowPptParser as PptParser
diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py
index 10a84d5b5..e45b5d51a 100644
--- a/deepdoc/parser/docx_parser.py
+++ b/deepdoc/parser/docx_parser.py
@@ -7,7 +7,7 @@ from rag.nlp import huqie
 from io import BytesIO
 
 
-class HuDocxParser:
+class RAGFlowDocxParser:
 
     def __extract_table_content(self, tb):
         df = []
diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py
index b8cbb1665..2c3e67757 100644
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -6,7 +6,7 @@ from io import BytesIO
 from rag.nlp import find_codec
 
 
-class HuExcelParser:
+class RAGFlowExcelParser:
     def html(self, fnm):
         if isinstance(fnm, str):
             wb = load_workbook(fnm)
@@ -74,5 +74,5 @@ class HuExcelParser:
 
 
 if __name__ == "__main__":
-    psr = HuExcelParser()
+    psr = RAGFlowExcelParser()
     psr(sys.argv[1])
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 96f4bdd28..67b9d172c 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -23,7 +23,7 @@ from huggingface_hub import snapshot_download
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 
 
-class HuParser:
+class RAGFlowPdfParser:
     def __init__(self):
         self.ocr = OCR()
         if hasattr(self, "model_speciess"):
diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
index 7266112d7..9b67336d0 100644
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -14,7 +14,7 @@ from io import BytesIO
 from pptx import Presentation
 
 
-class HuPptParser(object):
+class RAGFlowPptParser(object):
     def __init__(self):
         super().__init__()
diff --git a/deepdoc/vision/t_ocr.py b/deepdoc/vision/t_ocr.py
index d30f3c2bc..37a87fd3f 100644
--- a/deepdoc/vision/t_ocr.py
+++ b/deepdoc/vision/t_ocr.py
@@ -11,10 +11,6 @@
 #  limitations under the License.
 #
 
-from deepdoc.vision.seeit import draw_box
-from deepdoc.vision import OCR, init_in_out
-import argparse
-import numpy as np
 import os
 import sys
 sys.path.insert(
@@ -25,6 +21,11 @@ sys.path.insert(
                 os.path.abspath(__file__)),
             '../../')))
 
+from deepdoc.vision.seeit import draw_box
+from deepdoc.vision import OCR, init_in_out
+import argparse
+import numpy as np
+
 
 def main(args):
     ocr = OCR()
diff --git a/deepdoc/vision/t_recognizer.py b/deepdoc/vision/t_recognizer.py
index a04afa413..5a1c8592d 100644
--- a/deepdoc/vision/t_recognizer.py
+++ b/deepdoc/vision/t_recognizer.py
@@ -10,17 +10,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-
-from deepdoc.vision.seeit import draw_box
-from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
-from api.utils.file_utils import get_project_base_directory
-import argparse
-import os
-import sys
-import re
-
-import numpy as np
-
+import os, sys
 sys.path.insert(
     0,
     os.path.abspath(
@@ -29,6 +19,13 @@ sys.path.insert(
                 os.path.abspath(__file__)),
             '../../')))
 
+from deepdoc.vision.seeit import draw_box
+from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
+from api.utils.file_utils import get_project_base_directory
+import argparse
+import re
+import numpy as np
+
 
 def main(args):
     images, outputs = init_in_out(args)
diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py
index 3b035a435..14d789a5a 100644
--- a/rag/llm/__init__.py
+++ b/rag/llm/__init__.py
@@ -22,7 +22,7 @@ EmbeddingModel = {
     "Ollama": OllamaEmbed,
     "OpenAI": OpenAIEmbed,
     "Xinference": XinferenceEmbed,
-    "Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
+    "Tongyi-Qianwen": DefaultEmbedding, #QWenEmbed,
     "ZHIPU-AI": ZhipuEmbed,
     "FastEmbed": FastEmbed,
     "Youdao": YoudaoEmbed
diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py
index 597dbfdc9..ea6436cf2 100644
--- a/rag/llm/embedding_model.py
+++ b/rag/llm/embedding_model.py
@@ -56,7 +56,7 @@ class Base(ABC):
         raise NotImplementedError("Please implement encode method!")
 
 
-class HuEmbedding(Base):
+class DefaultEmbedding(Base):
     def __init__(self, *args, **kwargs):
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!
diff --git a/rag/nlp/huchunk.py b/rag/nlp/huchunk.py
deleted file mode 100644
index 8c4c6fc91..000000000
--- a/rag/nlp/huchunk.py
+++ /dev/null
@@ -1,475 +0,0 @@
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import re
-import os
-import copy
-import base64
-import magic
-from dataclasses import dataclass
-from typing import List
-import numpy as np
-from io import BytesIO
-
-
-class HuChunker:
-
-    @dataclass
-    class Fields:
-        text_chunks: List = None
-        table_chunks: List = None
-
-    def __init__(self):
-        self.MAX_LVL = 12
-        self.proj_patt = [
-            (r"第[零一二三四五六七八九十百]+章", 1),
-            (r"第[零一二三四五六七八九十百]+[条节]", 2),
-            (r"[零一二三四五六七八九十百]+[、 ]", 3),
-            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
-            (r"[0-9]+(、|\.[ ]|\.[^0-9])", 5),
-            (r"[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 6),
-            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 7),
-            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 8),
-            (r".{,48}[::??]@", 9),
-            (r"[0-9]+)", 10),
-            (r"[\((][0-9]+[)\)]", 11),
-            (r"[零一二三四五六七八九十百]+是", 12),
-            (r"[⚫•➢✓ ]", 12)
-        ]
-        self.lines = []
-
-    def _garbage(self, txt):
-        patt = [
-            r"(在此保证|不得以任何形式翻版|请勿传阅|仅供内部使用|未经事先书面授权)",
-            r"(版权(归本公司)*所有|免责声明|保留一切权力|承担全部责任|特别声明|报告中涉及)",
-            r"(不承担任何责任|投资者的通知事项:|任何机构和个人|本报告仅为|不构成投资)",
-            r"(不构成对任何个人或机构投资建议|联系其所在国家|本报告由从事证券交易)",
-            r"(本研究报告由|「认可投资者」|所有研究报告均以|请发邮件至)",
-            r"(本报告仅供|市场有风险,投资需谨慎|本报告中提及的)",
-            r"(本报告反映|此信息仅供|证券分析师承诺|具备证券投资咨询业务资格)",
-            r"^(时间|签字|签章)[::]",
-            r"(参考文献|目录索引|图表索引)",
-            r"[ ]*年[ ]+月[ ]+日",
-            r"^(中国证券业协会|[0-9]+年[0-9]+月[0-9]+日)$",
-            r"\.{10,}",
-            r"(———————END|帮我转发|欢迎收藏|快来关注我吧)"
-        ]
-        return any([re.search(p, txt) for p in patt])
-
-    def _proj_match(self, line):
-        for p, j in self.proj_patt:
-            if re.match(p, line):
-                return j
-        return
-
-    def _does_proj_match(self):
-        mat = [None for _ in range(len(self.lines))]
-        for i in range(len(self.lines)):
-            mat[i] = self._proj_match(self.lines[i])
-        return mat
-
-    def naive_text_chunk(self, text, ti="", MAX_LEN=612):
-        if text:
-            self.lines = [l.strip().replace(u'\u3000', u' ')
-                          .replace(u'\xa0', u'')
-                          for l in text.split("\n\n")]
-            self.lines = [l for l in self.lines if not self._garbage(l)]
-            self.lines = [re.sub(r"([ ]+| )", " ", l)
-                          for l in self.lines if l]
-            if not self.lines:
-                return []
-        arr = self.lines
-
-        res = [""]
-        i = 0
-        while i < len(arr):
-            a = arr[i]
-            if not a:
-                i += 1
-                continue
-            if len(a) > MAX_LEN:
-                a_ = a.split("\n")
-                if len(a_) >= 2:
-                    arr.pop(i)
-                    for j in range(2, len(a_) + 1):
-                        if len("\n".join(a_[:j])) >= MAX_LEN:
-                            arr.insert(i, "\n".join(a_[:j - 1]))
-                            arr.insert(i + 1, "\n".join(a_[j - 1:]))
-                            break
-                    else:
-                        assert False, f"Can't split: {a}"
-                    continue
-
-            if len(res[-1]) < MAX_LEN / 3:
-                res[-1] += "\n" + a
-            else:
-                res.append(a)
-            i += 1
-
-        if ti:
-            for i in range(len(res)):
-                if res[i].find("——来自") >= 0:
-                    continue
-                res[i] += f"\t——来自“{ti}”"
-
-        return res
-
-    def _merge(self):
-        # merge continuous same level text
-        lines = [self.lines[0]] if self.lines else []
-        for i in range(1, len(self.lines)):
-            if self.mat[i] == self.mat[i - 1] \
-                    and len(lines[-1]) < 256 \
-                    and len(self.lines[i]) < 256:
-                lines[-1] += "\n" + self.lines[i]
-                continue
-            lines.append(self.lines[i])
-        self.lines = lines
-        self.mat = self._does_proj_match()
-        return self.mat
-
-    def text_chunks(self, text):
-        if text:
-            self.lines = [l.strip().replace(u'\u3000', u' ')
-                          .replace(u'\xa0', u'')
-                          for l in re.split(r"[\r\n]", text)]
-            self.lines = [l for l in self.lines if not self._garbage(l)]
-            self.lines = [l for l in self.lines if l]
-        self.mat = self._does_proj_match()
-        mat = self._merge()
-
-        tree = []
-        for i in range(len(self.lines)):
-            tree.append({"proj": mat[i],
-                         "children": [],
-                         "read": False})
-        # find all children
-        for i in range(len(self.lines) - 1):
-            if tree[i]["proj"] is None:
-                continue
-            ed = i + 1
-            while ed < len(tree) and (tree[ed]["proj"] is None or
-                                      tree[ed]["proj"] > tree[i]["proj"]):
-                ed += 1
-
-            nxt = tree[i]["proj"] + 1
-            st = set([p["proj"] for p in tree[i + 1: ed] if p["proj"]])
-            while nxt not in st:
-                nxt += 1
-                if nxt > self.MAX_LVL:
-                    break
-            if nxt <= self.MAX_LVL:
-                for j in range(i + 1, ed):
-                    if tree[j]["proj"] is not None:
-                        break
-                    tree[i]["children"].append(j)
-                for j in range(i + 1, ed):
-                    if tree[j]["proj"] != nxt:
-                        continue
-                    tree[i]["children"].append(j)
-            else:
-                for j in range(i + 1, ed):
-                    tree[i]["children"].append(j)
-
-        # get DFS combinations, find all the paths to leaf
-        paths = []
-
-        def dfs(i, path):
-            nonlocal tree, paths
-            path.append(i)
-            tree[i]["read"] = True
-            if len(self.lines[i]) > 256:
-                paths.append(path)
-                return
-            if not tree[i]["children"]:
-                if len(path) > 1 or len(self.lines[i]) >= 32:
-                    paths.append(path)
-                return
-            for j in tree[i]["children"]:
-                dfs(j, copy.deepcopy(path))
-
-        for i, t in enumerate(tree):
-            if t["read"]:
-                continue
-            dfs(i, [])
-
-        # concat txt on the path for all paths
-        res = []
-        lines = np.array(self.lines)
-        for p in paths:
-            if len(p) < 2:
-                tree[p[0]]["read"] = False
-                continue
-            txt = "\n".join(lines[p[:-1]]) + "\n" + lines[p[-1]]
-            res.append(txt)
-        # concat continuous orphans
-        assert len(tree) == len(lines)
-        ii = 0
-        while ii < len(tree):
-            if tree[ii]["read"]:
-                ii += 1
-                continue
-            txt = lines[ii]
-            e = ii + 1
-            while e < len(tree) and not tree[e]["read"] and len(txt) < 256:
-                txt += "\n" + lines[e]
-                e += 1
-            res.append(txt)
-            ii = e
-
-        # if the node has not been read, find its daddy
-        def find_daddy(st):
-            nonlocal lines, tree
-            proj = tree[st]["proj"]
-            if len(self.lines[st]) > 512:
-                return [st]
-            if proj is None:
-                proj = self.MAX_LVL + 1
-            for i in range(st - 1, -1, -1):
-                if tree[i]["proj"] and tree[i]["proj"] < proj:
-                    a = [st] + find_daddy(i)
-                    return a
-            return []
-
-        return res
-
-
-class PdfChunker(HuChunker):
-
-    def __init__(self, pdf_parser):
-        self.pdf = pdf_parser
-        super().__init__()
-
-    def tableHtmls(self, pdfnm):
-        _, tbls = self.pdf(pdfnm, return_html=True)
-        res = []
-        for img, arr in tbls:
-            if arr[0].find("