From 97fe817186a1df7b2af51f2ae0c36d6de95f19bb Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Thu, 22 Feb 2024 17:16:22 +0800
Subject: [PATCH] Fix/upload limit (#2521)

Co-authored-by: jyong
Co-authored-by: StyleZhang
---
 api/.env.example                              |  2 ++
 api/config.py                                 |  3 ++
 api/core/indexing_runner.py                   | 17 ++++++++++
 api/services/annotation_service.py            |  7 ++++
 api/services/dataset_service.py               | 32 ++++++++++++------
 api/services/file_service.py                  |  4 +--
 api/tasks/document_indexing_task.py           | 33 ++++++++++++++++++-
 .../datasets/create/file-uploader/index.tsx   |  9 ++++-
 web/i18n/lang/dataset-creation.en.ts          |  1 +
 web/i18n/lang/dataset-creation.pt.ts          |  1 +
 web/i18n/lang/dataset-creation.uk.ts          |  1 +
 web/i18n/lang/dataset-creation.zh.ts          |  1 +
 12 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/api/.env.example b/api/.env.example
index d492c1f8be..89d550ba5a 100644
--- a/api/.env.example
+++ b/api/.env.example
@@ -130,3 +130,5 @@ UNSTRUCTURED_API_URL=
 
 SSRF_PROXY_HTTP_URL=
 SSRF_PROXY_HTTPS_URL=
+
+BATCH_UPLOAD_LIMIT=10
\ No newline at end of file
diff --git a/api/config.py b/api/config.py
index b6a8ce1438..95eabe204f 100644
--- a/api/config.py
+++ b/api/config.py
@@ -56,6 +56,7 @@ DEFAULTS = {
     'BILLING_ENABLED': 'False',
     'CAN_REPLACE_LOGO': 'False',
     'ETL_TYPE': 'dify',
+    'BATCH_UPLOAD_LIMIT': 20
 }
 
 
@@ -285,6 +286,8 @@ class Config:
         self.BILLING_ENABLED = get_bool_env('BILLING_ENABLED')
         self.CAN_REPLACE_LOGO = get_bool_env('CAN_REPLACE_LOGO')
 
+        self.BATCH_UPLOAD_LIMIT = get_env('BATCH_UPLOAD_LIMIT')
+
 
 
 class CloudEditionConfig(Config):
diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index a14001d04e..1f80726d5c 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -32,6 +32,7 @@ from models.dataset import Dataset, DatasetProcessRule, DocumentSegment
 from models.dataset import Document as DatasetDocument
 from models.model import UploadFile
 from models.source import DataSourceBinding
+from services.feature_service import FeatureService
 
 
 class IndexingRunner:
@@ -244,6 +245,14 @@ class IndexingRunner:
         """
         Estimate the indexing for the document.
         """
+        # check document limit
+        features = FeatureService.get_features(tenant_id)
+        if features.billing.enabled:
+            count = len(file_details)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+
         embedding_model_instance = None
         if dataset_id:
             dataset = Dataset.query.filter_by(
@@ -361,6 +370,14 @@ class IndexingRunner:
         """
         Estimate the indexing for the document.
         """
+        # check document limit
+        features = FeatureService.get_features(tenant_id)
+        if features.billing.enabled:
+            count = len(notion_info_list)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+
         embedding_model_instance = None
         if dataset_id:
             dataset = Dataset.query.filter_by(
diff --git a/api/services/annotation_service.py b/api/services/annotation_service.py
index 0a9e835586..db4639d40b 100644
--- a/api/services/annotation_service.py
+++ b/api/services/annotation_service.py
@@ -10,6 +10,7 @@ from werkzeug.exceptions import NotFound
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.model import App, AppAnnotationHitHistory, AppAnnotationSetting, Message, MessageAnnotation
+from services.feature_service import FeatureService
 from tasks.annotation.add_annotation_to_index_task import add_annotation_to_index_task
 from tasks.annotation.batch_import_annotations_task import batch_import_annotations_task
 from tasks.annotation.delete_annotation_index_task import delete_annotation_index_task
@@ -284,6 +285,12 @@ class AppAnnotationService:
                 result.append(content)
             if len(result) == 0:
                 raise ValueError("The CSV file is empty.")
+            # check annotation limit
+            features = FeatureService.get_features(current_user.current_tenant_id)
+            if features.billing.enabled:
+                annotation_quota_limit = features.annotation_quota_limit
+                if annotation_quota_limit.limit < len(result) + annotation_quota_limit.size:
+                    raise ValueError("The number of annotations exceeds the limit of your subscription.")
             # async job
             job_id = str(uuid.uuid4())
             indexing_cache_key = 'app_annotation_batch_import_{}'.format(str(job_id))
diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py
index 66c45ab8da..7fef7e5ec3 100644
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -36,6 +36,7 @@ from services.errors.account import NoPermissionError
 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
+from services.feature_service import FeatureService
 from services.vector_service import VectorService
 from tasks.clean_notion_document_task import clean_notion_document_task
 from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
@@ -452,7 +453,9 @@ class DocumentService:
                                       created_from: str = 'web'):
 
         # check document limit
-        if current_app.config['EDITION'] == 'CLOUD':
+        features = FeatureService.get_features(current_user.current_tenant_id)
+
+        if features.billing.enabled:
             if 'original_document_id' not in document_data or not document_data['original_document_id']:
                 count = 0
                 if document_data["data_source"]["type"] == "upload_file":
@@ -462,6 +465,9 @@ class DocumentService:
                     notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
                     for notion_info in notion_info_list:
                         count = count + len(notion_info['pages'])
+                batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+                if count > batch_upload_limit:
+                    raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
         # if dataset is empty, update dataset data_source_type
         if not dataset.data_source_type:
             dataset.data_source_type = document_data["data_source"]["type"]
@@ -741,14 +747,20 @@ class DocumentService:
     @staticmethod
     def save_document_without_dataset_id(tenant_id: str, document_data: dict,
                                          account: Account):
-        count = 0
-        if document_data["data_source"]["type"] == "upload_file":
-            upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
-            count = len(upload_file_list)
-        elif document_data["data_source"]["type"] == "notion_import":
-            notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
-            for notion_info in notion_info_list:
-                count = count + len(notion_info['pages'])
+        features = FeatureService.get_features(current_user.current_tenant_id)
+
+        if features.billing.enabled:
+            count = 0
+            if document_data["data_source"]["type"] == "upload_file":
+                upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
+                count = len(upload_file_list)
+            elif document_data["data_source"]["type"] == "notion_import":
+                notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
+                for notion_info in notion_info_list:
+                    count = count + len(notion_info['pages'])
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
 
         embedding_model = None
         dataset_collection_binding_id = None
@@ -1139,7 +1151,7 @@ class SegmentService:
                 segment.answer = args['answer']
             if 'keywords' in args and args['keywords']:
                 segment.keywords = args['keywords']
-            if'enabled' in args and args['enabled'] is not None:
+            if 'enabled' in args and args['enabled'] is not None:
                 segment.enabled = args['enabled']
             db.session.add(segment)
             db.session.commit()
diff --git a/api/services/file_service.py b/api/services/file_service.py
index 215ccf688a..a1c95b0911 100644
--- a/api/services/file_service.py
+++ b/api/services/file_service.py
@@ -20,9 +20,9 @@ from services.errors.file import FileTooLargeError, UnsupportedFileTypeError
 
 IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
 IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
-ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv'] + IMAGE_EXTENSIONS
+ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
 UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
-                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml'] + IMAGE_EXTENSIONS
+                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
 PREVIEW_WORDS_LIMIT = 3000
 
 
diff --git a/api/tasks/document_indexing_task.py b/api/tasks/document_indexing_task.py
index 87081e19e3..b776207050 100644
--- a/api/tasks/document_indexing_task.py
+++ b/api/tasks/document_indexing_task.py
@@ -4,10 +4,12 @@ import time
 
 import click
 from celery import shared_task
+from flask import current_app
 
 from core.indexing_runner import DocumentIsPausedException, IndexingRunner
 from extensions.ext_database import db
-from models.dataset import Document
+from models.dataset import Dataset, Document
+from services.feature_service import FeatureService
 
 
 @shared_task(queue='dataset')
@@ -21,6 +23,35 @@ def document_indexing_task(dataset_id: str, document_ids: list):
     """
     documents = []
     start_at = time.perf_counter()
+
+    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+
+    # check document limit
+    features = FeatureService.get_features(dataset.tenant_id)
+    try:
+        if features.billing.enabled:
+            vector_space = features.vector_space
+            count = len(document_ids)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+            if 0 < vector_space.limit <= vector_space.size:
+                raise ValueError("Your total number of documents plus the number of uploads have over the limit of "
+                                 "your subscription.")
+    except Exception as e:
+        for document_id in document_ids:
+            document = db.session.query(Document).filter(
+                Document.id == document_id,
+                Document.dataset_id == dataset_id
+            ).first()
+            if document:
+                document.indexing_status = 'error'
+                document.error = str(e)
+                document.stopped_at = datetime.datetime.utcnow()
+                db.session.add(document)
+        db.session.commit()
+        return
+
     for document_id in document_ids:
         logging.info(click.style('Start process document: {}'.format(document_id), fg='green'))
 
diff --git a/web/app/components/datasets/create/file-uploader/index.tsx b/web/app/components/datasets/create/file-uploader/index.tsx
index 3b8146a5d9..2ad766e800 100644
--- a/web/app/components/datasets/create/file-uploader/index.tsx
+++ b/web/app/components/datasets/create/file-uploader/index.tsx
@@ -14,6 +14,8 @@ import { fetchSupportFileTypes } from '@/service/datasets'
 import I18n from '@/context/i18n'
 import { LanguagesSupportedUnderscore, getModelRuntimeSupported } from '@/utils/language'
 
+const FILES_NUMBER_LIMIT = 20
+
 type IFileUploaderProps = {
   fileList: FileItem[]
   titleClassName?: string
@@ -176,6 +178,11 @@ const FileUploader = ({
     if (!files.length)
       return false
 
+    if (files.length + fileList.length > FILES_NUMBER_LIMIT) {
+      notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.filesNumber', { filesNumber: FILES_NUMBER_LIMIT }) })
+      return false
+    }
+
     const preparedFiles = files.map((file, index) => ({
       fileID: `file${index}-${Date.now()}`,
       file,
@@ -185,7 +192,7 @@ const FileUploader = ({
     prepareFileList(newFiles)
     fileListRef.current = newFiles
     uploadMultipleFiles(preparedFiles)
-  }, [prepareFileList, uploadMultipleFiles])
+  }, [prepareFileList, uploadMultipleFiles, notify, t, fileList])
 
   const handleDragEnter = (e: DragEvent) => {
     e.preventDefault()
diff --git a/web/i18n/lang/dataset-creation.en.ts b/web/i18n/lang/dataset-creation.en.ts
index 4112a82e2d..61f32436fd 100644
--- a/web/i18n/lang/dataset-creation.en.ts
+++ b/web/i18n/lang/dataset-creation.en.ts
@@ -28,6 +28,7 @@ const translation = {
         typeError: 'File type not supported',
         size: 'File too large. Maximum is {{size}}MB',
         count: 'Multiple files not supported',
+        filesNumber: 'You have reached the batch upload limit of {{filesNumber}}.',
       },
       cancel: 'Cancel',
       change: 'Change',
diff --git a/web/i18n/lang/dataset-creation.pt.ts b/web/i18n/lang/dataset-creation.pt.ts
index 6a5a91a945..08018eae61 100644
--- a/web/i18n/lang/dataset-creation.pt.ts
+++ b/web/i18n/lang/dataset-creation.pt.ts
@@ -28,6 +28,7 @@ const translation = {
         typeError: 'Tipo de arquivo não suportado',
         size: 'Arquivo muito grande. Máximo é {{size}}MB',
         count: 'Vários arquivos não suportados',
+        filesNumber: 'Limite de upload em massa {{filesNumber}}.',
       },
       cancel: 'Cancelar',
       change: 'Alterar',
diff --git a/web/i18n/lang/dataset-creation.uk.ts b/web/i18n/lang/dataset-creation.uk.ts
index 615040f787..7ba648c38f 100644
--- a/web/i18n/lang/dataset-creation.uk.ts
+++ b/web/i18n/lang/dataset-creation.uk.ts
@@ -28,6 +28,7 @@ const translation = {
         typeError: 'Тип файлу не підтримується',
         size: 'Файл занадто великий. Максимум – {{size}} МБ',
         count: 'Не підтримується завантаження кількох файлів',
+        filesNumber: 'Ліміт масового завантаження {{filesNumber}}.',
       },
       cancel: 'Скасувати',
       change: 'Змінити',
diff --git a/web/i18n/lang/dataset-creation.zh.ts b/web/i18n/lang/dataset-creation.zh.ts
index d96ed45cc7..dc401e1ce2 100644
--- a/web/i18n/lang/dataset-creation.zh.ts
+++ b/web/i18n/lang/dataset-creation.zh.ts
@@ -28,6 +28,7 @@ const translation = {
         typeError: '文件类型不支持',
         size: '文件太大了,不能超过 {{size}}MB',
         count: '暂不支持多个文件',
+        filesNumber: '批量上传限制 {{filesNumber}}。',
      },
       cancel: '取消',
       change: '更改文件',
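
All of the backend hunks above apply the same guard: when billing is enabled for the tenant, count the items in the incoming batch, compare that count against the configured BATCH_UPLOAD_LIMIT, and raise a ValueError before any indexing work begins. Below is a minimal, self-contained sketch of that pattern; check_batch_upload_limit and the billing_enabled flag are illustrative names that do not appear in the patch, reading the limit from an environment variable stands in for Flask's current_app.config, and the default of 20 mirrors the DEFAULTS entry added to api/config.py.

import os


def check_batch_upload_limit(item_count: int, billing_enabled: bool = True) -> None:
    """Reject a batch larger than BATCH_UPLOAD_LIMIT, mirroring the guard added in the patch."""
    if not billing_enabled:
        # Non-billing (e.g. self-hosted) setups skip the batch cap entirely.
        return
    # 20 mirrors the DEFAULTS value in api/config.py; api/.env.example overrides it with 10.
    batch_upload_limit = int(os.environ.get("BATCH_UPLOAD_LIMIT", 20))
    if item_count > batch_upload_limit:
        raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")


if __name__ == "__main__":
    check_batch_upload_limit(5)    # within the default limit, returns silently
    check_batch_upload_limit(25)   # exceeds the default limit of 20, raises ValueError

Failing fast like this keeps the limit enforcement in one obvious place per entry point (upload estimate, Notion import, document save, and the Celery indexing task), which is why the same few lines appear in each hunk rather than in a shared helper.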