From 83c6b1f308f9f6e93a28e7ac546e559538ae8925 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Wed, 13 Nov 2024 16:59:19 +0800 Subject: [PATCH] set DLA active for KG (#3386) ### What problem does this PR solve? Turns `layout_recognize` on (instead of forcing it off) when chunking documents for the knowledge graph, and passes the fetched page source to `RAGFlowHtmlParser` through its `binary` argument. ### Type of change - [x] Refactoring --- api/apps/document_app.py | 2 +- rag/app/knowledge_graph.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index a43ec5783..652cd4858 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -537,7 +537,7 @@ def parse(): options.add_argument('--disable-dev-shm-usage') driver = Chrome(options=options) driver.get(url) - sections = RAGFlowHtmlParser()(driver.page_source) + sections = RAGFlowHtmlParser()("", binary=driver.page_source) return get_json_result(data="\n".join(sections)) if 'file' not in request.files: diff --git a/rag/app/knowledge_graph.py b/rag/app/knowledge_graph.py index b7bcddd64..74fbbec10 100644 --- a/rag/app/knowledge_graph.py +++ b/rag/app/knowledge_graph.py @@ -9,10 +9,10 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): parser_config = kwargs.get( "parser_config", { - "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": False}) + "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": True}) eng = lang.lower() == "english" - parser_config["layout_recognize"] = False + parser_config["layout_recognize"] = True sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, parser_config=parser_config, callback=callback) chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,