fix uploading docx for mind map (#2064)

### What problem does this PR solve?
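
Uploading a document (e.g. a `.docx` file) for mind-map generation produced a mind-map chunk that lacked the document-name fields other chunks carry. This PR builds a doc-id → file-name map before parsing and adds `docnm_kwd`, `title_tks` (the file name tokenized with its extension stripped), and an empty `content_ltks` to the mind-map chunk.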

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Kevin Hu 2024-08-23 10:21:48 +08:00 committed by GitHub
parent 884fd83dc7
commit 89b05ad79f


```diff
@@ -17,6 +17,8 @@ import hashlib
 import json
 import os
 import random
+import re
+import traceback
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from datetime import datetime
@@ -33,7 +35,7 @@ from graphrag.mind_map_extractor import MindMapExtractor
 from rag.settings import SVR_QUEUE_NAME
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
-from rag.nlp import search
+from rag.nlp import search, rag_tokenizer
 from api.db import FileType, TaskStatus, ParserType, LLMType
 from api.db.db_models import DB, Knowledgebase, Tenant, Task
@@ -432,6 +434,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
     parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
     exe = ThreadPoolExecutor(max_workers=12)
     threads = []
+    doc_nm = {}
+    for d, blob in files:
+        doc_nm[d["id"]] = d["name"]
     for d, blob in files:
         kwargs = {
             "callback": dummy,
@@ -504,6 +509,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
                "id": get_uuid(),
                "doc_id": doc_id,
                "kb_id": [kb.id],
+               "docnm_kwd": doc_nm[doc_id],
+               "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc_nm[doc_id])),
+               "content_ltks": "",
                "content_with_weight": mind_map,
                "knowledge_graph_kwd": "mind_map"
            })
```
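
For context, here is a minimal, self-contained sketch of the chunk shape after this fix. The `tokenize` helper and the sample `files` data below are hypothetical stand-ins (RAGFlow's real `rag.nlp.rag_tokenizer.tokenize` is project-internal); only the field names and the extension-stripping regex come from the diff.

```python
import re

def tokenize(text: str) -> str:
    # Hypothetical stand-in for rag.nlp.rag_tokenizer.tokenize, which
    # produces space-separated tokens for the full-text title field.
    return " ".join(text.lower().split())

def build_mind_map_chunk(doc_id: str, doc_nm: dict, mind_map: str, kb_id: str) -> dict:
    # Shape of the chunk after this fix: the three added fields give the
    # mind map the same document-name metadata as ordinary content chunks.
    name = doc_nm[doc_id]
    return {
        "doc_id": doc_id,
        "kb_id": [kb_id],
        "docnm_kwd": name,  # raw file name, stored as a keyword field
        # Strip the trailing extension (e.g. ".docx") before tokenizing.
        "title_tks": tokenize(re.sub(r"\.[a-zA-Z]+$", "", name)),
        "content_ltks": "",  # a mind map contributes no plain-text tokens
        "content_with_weight": mind_map,
        "knowledge_graph_kwd": "mind_map",
    }

# The id -> name map is built once over the uploaded files before parsing.
files = [({"id": "doc-1", "name": "Quarterly Report.docx"}, b"...")]
doc_nm = {d["id"]: d["name"] for d, _ in files}
print(build_mind_map_chunk("doc-1", doc_nm, '{"id": "root", "children": []}', "kb-1"))
```

Note that the regex only removes a trailing all-letter extension, so `report.docx` loses its suffix while a name like `video.mp4` (digit in the extension) keeps it.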