From 19ded65c6601c6a07dffe7a962e19b82f316b42b Mon Sep 17 00:00:00 2001
From: Kung Quang <1468667058@qq.com>
Date: Thu, 8 Aug 2024 12:03:01 +0800
Subject: [PATCH] Fix a "TypeError: expected string or buffer" bug in docx files extracted using Knowledge Graph. #1859 (#1865)

### What problem does this PR solve?

Fix a "TypeError: expected string or buffer" bug in docx files extracted using Knowledge Graph. #1859

```
Traceback (most recent call last):
  File "//Users/XXX/ragflow/rag/svr/task_executor.py", line 149, in build
    cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/XXX/ragflow/rag/app/knowledge_graph.py", line 18, in chunk
    chunks = build_knowlege_graph_chunks(tenant_id, sections, callback,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/XXX/ragflow/graphrag/index.py", line 87, in build_knowlege_graph_chunks
    tkn_cnt = num_tokens_from_string(chunks[i])
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/XXX/github/ragflow/rag/utils/__init__.py", line 79, in num_tokens_from_string
    num_tokens = len(encoder.encode(string))
                     ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/XXX/tiktoken/core.py", line 116, in encode
    if match := _special_token_regex(disallowed_special).search(text):
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: expected string or buffer
```

The chunks passed to `num_tokens_from_string` here are of type `dict`, while the expected type is `str` (screenshots omitted).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
---
 rag/app/naive.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rag/app/naive.py b/rag/app/naive.py
index b4cfd4015..6e2947241 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -205,6 +205,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 "chunk_token_num", 128)), parser_config.get(
                 "delimiter", "\n!?。;!?"))
 
+        if kwargs.get("section_only", False):
+            return chunks
+
         res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
         cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
         return res
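
For reference, a minimal standalone sketch of the root cause (not project code; the encoding name `cl100k_base` and the dict key are assumptions for illustration only): tiktoken's `encode` only accepts a `str`, so handing it a `dict` chunk fails inside the special-token regex check, which is the bottom frame of the traceback above.

```python
# Minimal sketch of the failure mode hit via num_tokens_from_string():
# tiktoken only encodes str, so a dict chunk raises a TypeError in its regex check.
import tiktoken

encoder = tiktoken.get_encoding("cl100k_base")  # assumed encoding, for illustration

print(len(encoder.encode("a plain text section")))  # str works: prints a token count

try:
    encoder.encode({"content_with_weight": "a plain text section"})  # hypothetical dict chunk
except TypeError as exc:
    print(exc)  # e.g. "expected string or buffer"
```

With the early `return chunks` added in this patch, the docx branch honors `section_only` (presumably mirroring the non-docx branches), so the Knowledge Graph chunker receives plain-text sections rather than the tokenized `dict` chunks produced by `tokenize_chunks_docx`.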