From 89b05ad79f156744da6b96d97edeeb172f851347 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Fri, 23 Aug 2024 10:21:48 +0800 Subject: [PATCH] fix uploading docx for mind map (#2064) ### What problem does this PR solve? ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/db/services/document_service.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 328ee924a..83a38650a 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -17,6 +17,8 @@ import hashlib import json import os import random +import re +import traceback from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from datetime import datetime @@ -33,7 +35,7 @@ from graphrag.mind_map_extractor import MindMapExtractor from rag.settings import SVR_QUEUE_NAME from rag.utils.es_conn import ELASTICSEARCH from rag.utils.minio_conn import MINIO -from rag.nlp import search +from rag.nlp import search, rag_tokenizer from api.db import FileType, TaskStatus, ParserType, LLMType from api.db.db_models import DB, Knowledgebase, Tenant, Task @@ -432,6 +434,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id): parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False} exe = ThreadPoolExecutor(max_workers=12) threads = [] + doc_nm = {} + for d, blob in files: + doc_nm[d["id"]] = d["name"] for d, blob in files: kwargs = { "callback": dummy, @@ -504,6 +509,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id): "id": get_uuid(), "doc_id": doc_id, "kb_id": [kb.id], + "docnm_kwd": doc_nm[doc_id], + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc_nm[doc_id])), + "content_ltks": "", "content_with_weight": mind_map, "knowledge_graph_kwd": "mind_map" })