From 2d604d933060432fc51390f42f361357ff130f97 Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Fri, 25 Aug 2023 15:50:29 +0800 Subject: [PATCH] Fix/filter empty segment (#1004) Co-authored-by: jyong --- api/core/indexing_runner.py | 11 ++++++----- api/services/dataset_service.py | 4 ++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 98248874e2..8892f4508e 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -525,12 +525,13 @@ class IndexingRunner: documents = splitter.split_documents([text_doc]) split_documents = [] for document_node in documents: - doc_id = str(uuid.uuid4()) - hash = helper.generate_text_hash(document_node.page_content) - document_node.metadata['doc_id'] = doc_id - document_node.metadata['doc_hash'] = hash - split_documents.append(document_node) + if document_node.page_content.strip(): + doc_id = str(uuid.uuid4()) + hash = helper.generate_text_hash(document_node.page_content) + document_node.metadata['doc_id'] = doc_id + document_node.metadata['doc_hash'] = hash + split_documents.append(document_node) all_documents.extend(split_documents) # processing qa document if document_form == 'qa_model': diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index e64fa881f0..2f9844c636 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -891,6 +891,10 @@ class SegmentService: if document.doc_form == 'qa_model': if 'answer' not in args or not args['answer']: raise ValueError("Answer is required") + if not args['answer'].strip(): + raise ValueError("Answer is empty") + if 'content' not in args or not args['content'] or not args['content'].strip(): + raise ValueError("Content is empty") @classmethod def create_segment(cls, args: dict, document: Document, dataset: Dataset):