fix uploading docx for mind map (#2064)

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu 2024-08-23 10:21:48 +08:00 committed by GitHub
parent 884fd83dc7
commit 89b05ad79f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -17,6 +17,8 @@ import hashlib
import json
import os
import random
import re
import traceback
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from datetime import datetime
@ -33,7 +35,7 @@ from graphrag.mind_map_extractor import MindMapExtractor
from rag.settings import SVR_QUEUE_NAME
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO
from rag.nlp import search
from rag.nlp import search, rag_tokenizer
from api.db import FileType, TaskStatus, ParserType, LLMType
from api.db.db_models import DB, Knowledgebase, Tenant, Task
@ -432,6 +434,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
exe = ThreadPoolExecutor(max_workers=12)
threads = []
doc_nm = {}
for d, blob in files:
doc_nm[d["id"]] = d["name"]
for d, blob in files:
kwargs = {
"callback": dummy,
@ -504,6 +509,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
"id": get_uuid(),
"doc_id": doc_id,
"kb_id": [kb.id],
"docnm_kwd": doc_nm[doc_id],
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc_nm[doc_id])),
"content_ltks": "",
"content_with_weight": mind_map,
"knowledge_graph_kwd": "mind_map"
})