From 696efe494eef87bd5dc1bb130be302c83e2cc589 Mon Sep 17 00:00:00 2001 From: listeng <1536813+listeng@users.noreply.github.com> Date: Tue, 19 Mar 2024 20:55:15 +0800 Subject: [PATCH] fix: Ignore some empty page_content when append to split_documents (#2898) --- .../index_processor/processor/paragraph_index_processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py index 3f0467ee24..5fbc319fd6 100644 --- a/api/core/rag/index_processor/processor/paragraph_index_processor.py +++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py @@ -45,11 +45,12 @@ class ParagraphIndexProcessor(BaseIndexProcessor): # delete Spliter character page_content = document_node.page_content if page_content.startswith(".") or page_content.startswith("。"): - page_content = page_content[1:] + page_content = page_content[1:].strip() else: page_content = page_content - document_node.page_content = page_content - split_documents.append(document_node) + if len(page_content) > 0: + document_node.page_content = page_content + split_documents.append(document_node) all_documents.extend(split_documents) return all_documents