From d73a75506e77adec6c8af66d23824b10178db73c Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 13 Aug 2024 19:42:28 +0800 Subject: [PATCH] fix mind map bug (#1934) ### What problem does this PR solve? ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/document_app.py | 18 ++++++++++++------ api/db/services/api_service.py | 1 + graphrag/mind_map_extractor.py | 2 +- rag/app/naive.py | 3 +-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 76a0e4b54..5d7e0953c 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -452,7 +452,7 @@ def get_image(image_id): @login_required @validate_request("conversation_id") def upload_and_parse(): - req = request.json + from rag.app import presentation, picture, naive, audio, email if 'file' not in request.files: return get_json_result( data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR) @@ -463,7 +463,7 @@ def upload_and_parse(): return get_json_result( data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR) - e, conv = ConversationService.get_by_id(req["conversation_id"]) + e, conv = ConversationService.get_by_id(request.form.get("conversation_id")) if not e: return get_data_error_result(retmsg="Conversation not found!") e, dia = DialogService.get_by_id(conv.dialog_id) @@ -487,6 +487,12 @@ def upload_and_parse(): def dummy(prog=None, msg=""): pass + FACTORY = { + ParserType.PRESENTATION.value: presentation, + ParserType.PICTURE.value: picture, + ParserType.AUDIO.value: audio, + ParserType.EMAIL.value: email + } parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False} exe = ThreadPoolExecutor(max_workers=12) threads = [] @@ -497,7 +503,7 @@ def upload_and_parse(): "from_page": 0, "to_page": 100000 } - threads.append(exe.submit(naive.chunk, d["name"], blob, **kwargs)) + threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs)) for (docinfo,_), th in zip(files, threads): docs = [] @@ -550,7 +556,7 @@ def upload_and_parse(): for doc_id in docids: cks = [c for c in docs if c["doc_id"] == doc_id] - if parser_ids[doc_id] != ParserType.PICTURE.value: + if False and parser_ids[doc_id] != ParserType.PICTURE.value: mindmap = MindMapExtractor(llm_bdl) try: mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output, ensure_ascii=False, indent=2) @@ -564,7 +570,7 @@ def upload_and_parse(): except Exception as e: stat_logger.error("Mind map generation error:", traceback.format_exc()) - vects = embedding(doc_id, cks) + vects = embedding(doc_id, [c["content_with_weight"] for c in cks]) assert len(cks) == len(vects) for i, d in enumerate(cks): v = vects[i] @@ -575,4 +581,4 @@ def upload_and_parse(): DocumentService.increment_chunk_num( doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0) - return get_json_result(data=[d["id"] for d in files]) + return get_json_result(data=[d["id"] for d,_ in files]) diff --git a/api/db/services/api_service.py b/api/db/services/api_service.py index a99f45aea..d65bc3d54 100644 --- a/api/db/services/api_service.py +++ b/api/db/services/api_service.py @@ -46,6 +46,7 @@ class API4ConversationService(CommonService): @classmethod @DB.connection_context() def stats(cls, tenant_id, from_date, to_date, source=None): + if len(to_date) == 10: to_date += " 23:59:59" return cls.model.select( cls.model.create_date.truncate("day").alias("dt"), peewee.fn.COUNT( diff --git a/graphrag/mind_map_extractor.py b/graphrag/mind_map_extractor.py index 5eca9a204..fabdaed08 100644 --- a/graphrag/mind_map_extractor.py +++ b/graphrag/mind_map_extractor.py @@ -113,7 +113,7 @@ class MindMapExtractor: "children": [{"id": self._key(k), "children": self._be_children(v, keyset)} for k, v in merge_json.items() if isinstance(v, dict) and self._key(k)]} else: - k = self._key(list(self._be_children.keys())[0]) + k = self._key(list(merge_json.keys())[0]) merge_json = {"id": k, "children": self._be_children(list(merge_json.items())[0][1], set([k]))} except Exception as e: diff --git a/rag/app/naive.py b/rag/app/naive.py index 9c66d7646..73d92523d 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -61,9 +61,8 @@ class Docx(DocxParser): if pn > to_page: break if from_page <= pn < to_page: - current_image = None if p.text.strip(): - if p.style.name == 'Caption': + if p.style and p.style.name == 'Caption': former_image = None if lines and lines[-1][1] and lines[-1][2] != 'Caption': former_image = lines[-1][1].pop()