diff --git a/.gitignore b/.gitignore index 03920f14a4..957c39cdd1 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,5 @@ docker/volumes/weaviate/* sdks/python-client/build sdks/python-client/dist sdks/python-client/dify_client.egg-info + +.vscode/ \ No newline at end of file diff --git a/api/controllers/console/__init__.py b/api/controllers/console/__init__.py index f3d245895a..89735e83ae 100644 --- a/api/controllers/console/__init__.py +++ b/api/controllers/console/__init__.py @@ -9,7 +9,7 @@ api = ExternalApi(bp) from . import setup, version, apikey, admin # Import app controllers -from .app import app, site, completion, model_config, statistic, conversation, message, generator +from .app import app, site, completion, model_config, statistic, conversation, message, generator, audio # Import auth controllers from .auth import login, oauth, data_source_oauth @@ -21,4 +21,4 @@ from .datasets import datasets, datasets_document, datasets_segments, file, hit_ from .workspace import workspace, members, providers, account # Import explore controllers -from .explore import installed_app, recommended_app, completion, conversation, message, parameter, saved_message +from .explore import installed_app, recommended_app, completion, conversation, message, parameter, saved_message, audio diff --git a/api/controllers/console/app/app.py b/api/controllers/console/app/app.py index 127a568f7f..06d6b442ae 100644 --- a/api/controllers/console/app/app.py +++ b/api/controllers/console/app/app.py @@ -22,6 +22,7 @@ model_config_fields = { 'opening_statement': fields.String, 'suggested_questions': fields.Raw(attribute='suggested_questions_list'), 'suggested_questions_after_answer': fields.Raw(attribute='suggested_questions_after_answer_dict'), + 'speech_to_text': fields.Raw(attribute='speech_to_text_dict'), 'more_like_this': fields.Raw(attribute='more_like_this_dict'), 'model': fields.Raw(attribute='model_dict'), 'user_input_form': fields.Raw(attribute='user_input_form_list'), @@ -144,6 +145,7 @@ class AppListApi(Resource): opening_statement=model_configuration['opening_statement'], suggested_questions=json.dumps(model_configuration['suggested_questions']), suggested_questions_after_answer=json.dumps(model_configuration['suggested_questions_after_answer']), + speech_to_text=json.dumps(model_configuration['speech_to_text']), more_like_this=json.dumps(model_configuration['more_like_this']), model=json.dumps(model_configuration['model']), user_input_form=json.dumps(model_configuration['user_input_form']), @@ -434,6 +436,7 @@ class AppCopy(Resource): opening_statement=app_config.opening_statement, suggested_questions=app_config.suggested_questions, suggested_questions_after_answer=app_config.suggested_questions_after_answer, + speech_to_text=app_config.speech_to_text, more_like_this=app_config.more_like_this, model=app_config.model, user_input_form=app_config.user_input_form, diff --git a/api/controllers/console/app/audio.py b/api/controllers/console/app/audio.py new file mode 100644 index 0000000000..e6bd2fdf28 --- /dev/null +++ b/api/controllers/console/app/audio.py @@ -0,0 +1,69 @@ +# -*- coding:utf-8 -*- +import logging + +from flask import request +from flask_login import login_required +from werkzeug.exceptions import InternalServerError, NotFound + +import services +from controllers.console import api +from controllers.console.app import _get_app +from controllers.console.app.error import AppUnavailableError, \ + ProviderNotInitializeError, CompletionRequestError, ProviderQuotaExceededError, \ + ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, \ + UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError +from controllers.console.setup import setup_required +from controllers.console.wraps import account_initialization_required +from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \ + LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError +from flask_restful import Resource +from services.audio_service import AudioService +from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \ + UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError + + +class ChatMessageAudioApi(Resource): + @setup_required + @login_required + @account_initialization_required + def post(self, app_id): + app_id = str(app_id) + app_model = _get_app(app_id, 'chat') + + file = request.files['file'] + + try: + response = AudioService.transcript( + tenant_id=app_model.tenant_id, + file=file, + ) + + return response + except services.errors.app_model_config.AppModelConfigBrokenError: + logging.exception("App model config broken.") + raise AppUnavailableError() + except NoAudioUploadedServiceError: + raise NoAudioUploadedError() + except AudioTooLargeServiceError as e: + raise AudioTooLargeError(str(e)) + except UnsupportedAudioTypeServiceError: + raise UnsupportedAudioTypeError() + except ProviderNotSupportSpeechToTextServiceError: + raise ProviderNotSupportSpeechToTextError() + except ProviderTokenNotInitError: + raise ProviderNotInitializeError() + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError, + LLMRateLimitError, LLMAuthorizationError) as e: + raise CompletionRequestError(str(e)) + except ValueError as e: + raise e + except Exception as e: + logging.exception("internal server error.") + raise InternalServerError() + + +api.add_resource(ChatMessageAudioApi, '/apps//audio-to-text') \ No newline at end of file diff --git a/api/controllers/console/app/error.py b/api/controllers/console/app/error.py index 8923f90bf8..905e5b273a 100644 --- a/api/controllers/console/app/error.py +++ b/api/controllers/console/app/error.py @@ -49,3 +49,27 @@ class AppMoreLikeThisDisabledError(BaseHTTPException): error_code = 'app_more_like_this_disabled' description = "The 'More like this' feature is disabled. Please refresh your page." code = 403 + + +class NoAudioUploadedError(BaseHTTPException): + error_code = 'no_audio_uploaded' + description = "Please upload your audio." + code = 400 + + +class AudioTooLargeError(BaseHTTPException): + error_code = 'audio_too_large' + description = "Audio size exceeded. {message}" + code = 413 + + +class UnsupportedAudioTypeError(BaseHTTPException): + error_code = 'unsupported_audio_type' + description = "Audio type not allowed." + code = 415 + + +class ProviderNotSupportSpeechToTextError(BaseHTTPException): + error_code = 'provider_not_support_speech_to_text' + description = "Provider not support speech to text." + code = 400 \ No newline at end of file diff --git a/api/controllers/console/app/model_config.py b/api/controllers/console/app/model_config.py index adc10d6609..87bd70f4fa 100644 --- a/api/controllers/console/app/model_config.py +++ b/api/controllers/console/app/model_config.py @@ -41,6 +41,7 @@ class ModelConfigResource(Resource): opening_statement=model_configuration['opening_statement'], suggested_questions=json.dumps(model_configuration['suggested_questions']), suggested_questions_after_answer=json.dumps(model_configuration['suggested_questions_after_answer']), + speech_to_text=json.dumps(model_configuration['speech_to_text']), more_like_this=json.dumps(model_configuration['more_like_this']), model=json.dumps(model_configuration['model']), user_input_form=json.dumps(model_configuration['user_input_form']), diff --git a/api/controllers/console/explore/audio.py b/api/controllers/console/explore/audio.py new file mode 100644 index 0000000000..a027fe625e --- /dev/null +++ b/api/controllers/console/explore/audio.py @@ -0,0 +1,66 @@ +# -*- coding:utf-8 -*- +import logging + +from flask import request +from werkzeug.exceptions import InternalServerError + +import services +from controllers.console import api +from controllers.console.app.error import AppUnavailableError, ProviderNotInitializeError, \ + ProviderQuotaExceededError, ProviderModelCurrentlyNotSupportError, CompletionRequestError, \ + NoAudioUploadedError, AudioTooLargeError, \ + UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError +from controllers.console.explore.wraps import InstalledAppResource +from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \ + LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError +from services.audio_service import AudioService +from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \ + UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError +from models.model import AppModelConfig + + +class ChatAudioApi(InstalledAppResource): + def post(self, installed_app): + app_model = installed_app.app + app_model_config: AppModelConfig = app_model.app_model_config + + if not app_model_config.speech_to_text_dict['enabled']: + raise AppUnavailableError() + + file = request.files['file'] + + try: + response = AudioService.transcript( + tenant_id=app_model.tenant_id, + file=file, + ) + + return response + except services.errors.app_model_config.AppModelConfigBrokenError: + logging.exception("App model config broken.") + raise AppUnavailableError() + except NoAudioUploadedServiceError: + raise NoAudioUploadedError() + except AudioTooLargeServiceError as e: + raise AudioTooLargeError(str(e)) + except UnsupportedAudioTypeServiceError: + raise UnsupportedAudioTypeError() + except ProviderNotSupportSpeechToTextServiceError: + raise ProviderNotSupportSpeechToTextError() + except ProviderTokenNotInitError: + raise ProviderNotInitializeError() + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError, + LLMRateLimitError, LLMAuthorizationError) as e: + raise CompletionRequestError(str(e)) + except ValueError as e: + raise e + except Exception as e: + logging.exception("internal server error.") + raise InternalServerError() + + +api.add_resource(ChatAudioApi, '/installed-apps//audio-to-text', endpoint='installed_app_audio') \ No newline at end of file diff --git a/api/controllers/console/explore/parameter.py b/api/controllers/console/explore/parameter.py index 2d0459ed40..44f6d81e1e 100644 --- a/api/controllers/console/explore/parameter.py +++ b/api/controllers/console/explore/parameter.py @@ -21,6 +21,7 @@ class AppParameterApi(InstalledAppResource): 'opening_statement': fields.String, 'suggested_questions': fields.Raw, 'suggested_questions_after_answer': fields.Raw, + 'speech_to_text': fields.Raw, 'more_like_this': fields.Raw, 'user_input_form': fields.Raw, } @@ -35,6 +36,7 @@ class AppParameterApi(InstalledAppResource): 'opening_statement': app_model_config.opening_statement, 'suggested_questions': app_model_config.suggested_questions_list, 'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict, + 'speech_to_text': app_model_config.speech_to_text_dict, 'more_like_this': app_model_config.more_like_this_dict, 'user_input_form': app_model_config.user_input_form_list } diff --git a/api/controllers/service_api/__init__.py b/api/controllers/service_api/__init__.py index 05318a2076..ce5b44fc22 100644 --- a/api/controllers/service_api/__init__.py +++ b/api/controllers/service_api/__init__.py @@ -7,6 +7,6 @@ bp = Blueprint('service_api', __name__, url_prefix='/v1') api = ExternalApi(bp) -from .app import completion, app, conversation, message +from .app import completion, app, conversation, message, audio from .dataset import document diff --git a/api/controllers/service_api/app/app.py b/api/controllers/service_api/app/app.py index 08532441c8..2b0b11c14f 100644 --- a/api/controllers/service_api/app/app.py +++ b/api/controllers/service_api/app/app.py @@ -22,6 +22,7 @@ class AppParameterApi(AppApiResource): 'opening_statement': fields.String, 'suggested_questions': fields.Raw, 'suggested_questions_after_answer': fields.Raw, + 'speech_to_text': fields.Raw, 'more_like_this': fields.Raw, 'user_input_form': fields.Raw, } @@ -35,6 +36,7 @@ class AppParameterApi(AppApiResource): 'opening_statement': app_model_config.opening_statement, 'suggested_questions': app_model_config.suggested_questions_list, 'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict, + 'speech_to_text': app_model_config.speech_to_text_dict, 'more_like_this': app_model_config.more_like_this_dict, 'user_input_form': app_model_config.user_input_form_list } diff --git a/api/controllers/service_api/app/audio.py b/api/controllers/service_api/app/audio.py new file mode 100644 index 0000000000..aacf1ca2a2 --- /dev/null +++ b/api/controllers/service_api/app/audio.py @@ -0,0 +1,61 @@ +import logging + +from flask import request +from werkzeug.exceptions import InternalServerError + +import services +from controllers.service_api import api +from controllers.service_api.app.error import AppUnavailableError, ProviderNotInitializeError, CompletionRequestError, ProviderQuotaExceededError, \ + ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, UnsupportedAudioTypeError, \ + ProviderNotSupportSpeechToTextError +from controllers.service_api.wraps import AppApiResource +from core.llm.error import LLMBadRequestError, LLMAuthorizationError, LLMAPIUnavailableError, LLMAPIConnectionError, \ + LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError +from models.model import App, AppModelConfig +from services.audio_service import AudioService +from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \ + UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError + +class AudioApi(AppApiResource): + def post(self, app_model: App, end_user): + app_model_config: AppModelConfig = app_model.app_model_config + + if not app_model_config.speech_to_text_dict['enabled']: + raise AppUnavailableError() + + file = request.files['file'] + + try: + response = AudioService.transcript( + tenant_id=app_model.tenant_id, + file=file, + ) + + return response + except services.errors.app_model_config.AppModelConfigBrokenError: + logging.exception("App model config broken.") + raise AppUnavailableError() + except NoAudioUploadedServiceError: + raise NoAudioUploadedError() + except AudioTooLargeServiceError as e: + raise AudioTooLargeError(str(e)) + except UnsupportedAudioTypeServiceError: + raise UnsupportedAudioTypeError() + except ProviderNotSupportSpeechToTextServiceError: + raise ProviderNotSupportSpeechToTextError() + except ProviderTokenNotInitError: + raise ProviderNotInitializeError() + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError, + LLMRateLimitError, LLMAuthorizationError) as e: + raise CompletionRequestError(str(e)) + except ValueError as e: + raise e + except Exception as e: + logging.exception("internal server error.") + raise InternalServerError() + +api.add_resource(AudioApi, '/audio-to-text') \ No newline at end of file diff --git a/api/controllers/service_api/app/error.py b/api/controllers/service_api/app/error.py index b7f6e0f6fa..f509dc1b48 100644 --- a/api/controllers/service_api/app/error.py +++ b/api/controllers/service_api/app/error.py @@ -51,3 +51,27 @@ class CompletionRequestError(BaseHTTPException): description = "Completion request failed." code = 400 + +class NoAudioUploadedError(BaseHTTPException): + error_code = 'no_audio_uploaded' + description = "Please upload your audio." + code = 400 + + +class AudioTooLargeError(BaseHTTPException): + error_code = 'audio_too_large' + description = "Audio size exceeded. {message}" + code = 413 + + +class UnsupportedAudioTypeError(BaseHTTPException): + error_code = 'unsupported_audio_type' + description = "Audio type not allowed." + code = 415 + + +class ProviderNotSupportSpeechToTextError(BaseHTTPException): + error_code = 'provider_not_support_speech_to_text' + description = "Provider not support speech to text." + code = 400 + diff --git a/api/controllers/web/__init__.py b/api/controllers/web/__init__.py index b793f11014..c419453665 100644 --- a/api/controllers/web/__init__.py +++ b/api/controllers/web/__init__.py @@ -7,4 +7,4 @@ bp = Blueprint('web', __name__, url_prefix='/api') api = ExternalApi(bp) -from . import completion, app, conversation, message, site, saved_message +from . import completion, app, conversation, message, site, saved_message, audio diff --git a/api/controllers/web/app.py b/api/controllers/web/app.py index 1396531111..2122f01a7f 100644 --- a/api/controllers/web/app.py +++ b/api/controllers/web/app.py @@ -21,6 +21,7 @@ class AppParameterApi(WebApiResource): 'opening_statement': fields.String, 'suggested_questions': fields.Raw, 'suggested_questions_after_answer': fields.Raw, + 'speech_to_text': fields.Raw, 'more_like_this': fields.Raw, 'user_input_form': fields.Raw, } @@ -34,6 +35,7 @@ class AppParameterApi(WebApiResource): 'opening_statement': app_model_config.opening_statement, 'suggested_questions': app_model_config.suggested_questions_list, 'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict, + 'speech_to_text': app_model_config.speech_to_text_dict, 'more_like_this': app_model_config.more_like_this_dict, 'user_input_form': app_model_config.user_input_form_list } diff --git a/api/controllers/web/audio.py b/api/controllers/web/audio.py new file mode 100644 index 0000000000..b07382176c --- /dev/null +++ b/api/controllers/web/audio.py @@ -0,0 +1,63 @@ +# -*- coding:utf-8 -*- +import logging + +from flask import request +from werkzeug.exceptions import InternalServerError + +import services +from controllers.web import api +from controllers.web.error import AppUnavailableError, ProviderNotInitializeError, CompletionRequestError, \ + ProviderQuotaExceededError, ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, \ + UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError +from controllers.web.wraps import WebApiResource +from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \ + LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError +from services.audio_service import AudioService +from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \ + UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError +from models.model import App, AppModelConfig + + +class AudioApi(WebApiResource): + def post(self, app_model: App, end_user): + app_model_config: AppModelConfig = app_model.app_model_config + + if not app_model_config.speech_to_text_dict['enabled']: + raise AppUnavailableError() + + file = request.files['file'] + + try: + response = AudioService.transcript( + tenant_id=app_model.tenant_id, + file=file, + ) + + return response + except services.errors.app_model_config.AppModelConfigBrokenError: + logging.exception("App model config broken.") + raise AppUnavailableError() + except NoAudioUploadedServiceError: + raise NoAudioUploadedError() + except AudioTooLargeServiceError as e: + raise AudioTooLargeError(str(e)) + except UnsupportedAudioTypeServiceError: + raise UnsupportedAudioTypeError() + except ProviderNotSupportSpeechToTextServiceError: + raise ProviderNotSupportSpeechToTextError() + except ProviderTokenNotInitError: + raise ProviderNotInitializeError() + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError, + LLMRateLimitError, LLMAuthorizationError) as e: + raise CompletionRequestError(str(e)) + except ValueError as e: + raise e + except Exception as e: + logging.exception("internal server error.") + raise InternalServerError() + +api.add_resource(AudioApi, '/audio-to-text') \ No newline at end of file diff --git a/api/controllers/web/error.py b/api/controllers/web/error.py index fdfe36f6d1..c66bbd85b2 100644 --- a/api/controllers/web/error.py +++ b/api/controllers/web/error.py @@ -62,3 +62,27 @@ class AppSuggestedQuestionsAfterAnswerDisabledError(BaseHTTPException): error_code = 'app_suggested_questions_after_answer_disabled' description = "The 'Suggested Questions After Answer' feature is disabled. Please refresh your page." code = 403 + + +class NoAudioUploadedError(BaseHTTPException): + error_code = 'no_audio_uploaded' + description = "Please upload your audio." + code = 400 + + +class AudioTooLargeError(BaseHTTPException): + error_code = 'audio_too_large' + description = "Audio size exceeded. {message}" + code = 413 + + +class UnsupportedAudioTypeError(BaseHTTPException): + error_code = 'unsupported_audio_type' + description = "Audio type not allowed." + code = 415 + + +class ProviderNotSupportSpeechToTextError(BaseHTTPException): + error_code = 'provider_not_support_speech_to_text' + description = "Provider not support speech to text." + code = 400 \ No newline at end of file diff --git a/api/core/llm/whisper.py b/api/core/llm/whisper.py new file mode 100644 index 0000000000..3ad993be17 --- /dev/null +++ b/api/core/llm/whisper.py @@ -0,0 +1,25 @@ +import openai +from models.provider import ProviderName +from core.llm.error_handle_wraps import handle_llm_exceptions +from core.llm.provider.base import BaseProvider + + +class Whisper: + + def __init__(self, provider: BaseProvider): + self.provider = provider + + if self.provider.get_provider_name() == ProviderName.OPENAI: + self.client = openai.Audio + self.credentials = provider.get_credentials() + + @handle_llm_exceptions + def transcribe(self, file): + return self.client.transcribe( + model='whisper-1', + file=file, + api_key=self.credentials.get('openai_api_key'), + api_base=self.credentials.get('openai_api_base'), + api_type=self.credentials.get('openai_api_type'), + api_version=self.credentials.get('openai_api_version'), + ) diff --git a/api/migrations/versions/a5b56fb053ef_app_config_add_speech_to_text.py b/api/migrations/versions/a5b56fb053ef_app_config_add_speech_to_text.py new file mode 100644 index 0000000000..b197f8f5a8 --- /dev/null +++ b/api/migrations/versions/a5b56fb053ef_app_config_add_speech_to_text.py @@ -0,0 +1,32 @@ +"""app config add speech_to_text + +Revision ID: a5b56fb053ef +Revises: d3d503a3471c +Create Date: 2023-07-06 17:55:20.894149 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = 'a5b56fb053ef' +down_revision = 'd3d503a3471c' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table('app_model_configs', schema=None) as batch_op: + batch_op.add_column(sa.Column('speech_to_text', sa.Text(), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table('app_model_configs', schema=None) as batch_op: + batch_op.drop_column('speech_to_text') + + # ### end Alembic commands ### diff --git a/api/models/model.py b/api/models/model.py index d47334fe90..ec4197e588 100644 --- a/api/models/model.py +++ b/api/models/model.py @@ -81,6 +81,7 @@ class AppModelConfig(db.Model): opening_statement = db.Column(db.Text) suggested_questions = db.Column(db.Text) suggested_questions_after_answer = db.Column(db.Text) + speech_to_text = db.Column(db.Text) more_like_this = db.Column(db.Text) model = db.Column(db.Text) user_input_form = db.Column(db.Text) @@ -104,6 +105,11 @@ class AppModelConfig(db.Model): def suggested_questions_after_answer_dict(self) -> dict: return json.loads(self.suggested_questions_after_answer) if self.suggested_questions_after_answer \ else {"enabled": False} + + @property + def speech_to_text_dict(self) -> dict: + return json.loads(self.speech_to_text) if self.speech_to_text \ + else {"enabled": False} @property def more_like_this_dict(self) -> dict: @@ -223,6 +229,9 @@ class Conversation(db.Model): model_config['suggested_questions_after_answer'] = override_model_configs[ 'suggested_questions_after_answer'] \ if 'suggested_questions_after_answer' in override_model_configs else {"enabled": False} + model_config['speech_to_text'] = override_model_configs[ + 'speech_to_text'] \ + if 'speech_to_text' in override_model_configs else {"enabled": False} model_config['more_like_this'] = override_model_configs['more_like_this'] \ if 'more_like_this' in override_model_configs else {"enabled": False} model_config['user_input_form'] = override_model_configs['user_input_form'] @@ -239,6 +248,7 @@ class Conversation(db.Model): model_config['opening_statement'] = app_model_config.opening_statement model_config['suggested_questions'] = app_model_config.suggested_questions_list model_config['suggested_questions_after_answer'] = app_model_config.suggested_questions_after_answer_dict + model_config['speech_to_text'] = app_model_config.speech_to_text_dict model_config['more_like_this'] = app_model_config.more_like_this_dict model_config['user_input_form'] = app_model_config.user_input_form_list diff --git a/api/services/app_model_config_service.py b/api/services/app_model_config_service.py index f2ab184573..77293a1a05 100644 --- a/api/services/app_model_config_service.py +++ b/api/services/app_model_config_service.py @@ -4,6 +4,7 @@ import uuid from core.constant import llm_constant from models.account import Account from services.dataset_service import DatasetService +from core.llm.llm_builder import LLMBuilder class AppModelConfigService: @@ -109,6 +110,26 @@ class AppModelConfigService: if not isinstance(config["suggested_questions_after_answer"]["enabled"], bool): raise ValueError("enabled in suggested_questions_after_answer must be of boolean type") + # speech_to_text + if 'speech_to_text' not in config or not config["speech_to_text"]: + config["speech_to_text"] = { + "enabled": False + } + + if not isinstance(config["speech_to_text"], dict): + raise ValueError("speech_to_text must be of dict type") + + if "enabled" not in config["speech_to_text"] or not config["speech_to_text"]["enabled"]: + config["speech_to_text"]["enabled"] = False + + if not isinstance(config["speech_to_text"]["enabled"], bool): + raise ValueError("enabled in speech_to_text must be of boolean type") + + provider_name = LLMBuilder.get_default_provider(account.current_tenant_id) + + if config["speech_to_text"]["enabled"] and provider_name != 'openai': + raise ValueError("provider not support speech to text") + # more_like_this if 'more_like_this' not in config or not config["more_like_this"]: config["more_like_this"] = { @@ -277,6 +298,7 @@ class AppModelConfigService: "opening_statement": config["opening_statement"], "suggested_questions": config["suggested_questions"], "suggested_questions_after_answer": config["suggested_questions_after_answer"], + "speech_to_text": config["speech_to_text"], "more_like_this": config["more_like_this"], "model": { "provider": config["model"]["provider"], diff --git a/api/services/audio_service.py b/api/services/audio_service.py new file mode 100644 index 0000000000..4506f787db --- /dev/null +++ b/api/services/audio_service.py @@ -0,0 +1,43 @@ +import io +from werkzeug.datastructures import FileStorage +from core.llm.llm_builder import LLMBuilder +from core.llm.provider.llm_provider_service import LLMProviderService +from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError +from core.llm.whisper import Whisper +from models.provider import ProviderName + +FILE_SIZE_LIMIT = 1 * 1024 * 1024 +ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm'] + +class AudioService: + @classmethod + def transcript(cls, tenant_id: str, file: FileStorage): + if file is None: + raise NoAudioUploadedServiceError() + + extension = file.mimetype + if extension not in [f'audio/{ext}' for ext in ALLOWED_EXTENSIONS]: + raise UnsupportedAudioTypeServiceError() + + file_content = file.read() + file_size = len(file_content) + + if file_size > FILE_SIZE_LIMIT: + message = f"({file_size} > {FILE_SIZE_LIMIT})" + raise AudioTooLargeServiceError(message) + + provider_name = LLMBuilder.get_default_provider(tenant_id) + if provider_name != ProviderName.OPENAI.value: + raise ProviderNotSupportSpeechToTextServiceError('haha') + + provider_service = LLMProviderService(tenant_id, provider_name) + + buffer = io.BytesIO(file_content) + buffer.name = 'temp.wav' + + return Whisper(provider_service.provider).transcribe(buffer) + + + + + \ No newline at end of file diff --git a/api/services/errors/__init__.py b/api/services/errors/__init__.py index fe77ca86b6..1f01e033ca 100644 --- a/api/services/errors/__init__.py +++ b/api/services/errors/__init__.py @@ -1,7 +1,7 @@ # -*- coding:utf-8 -*- __all__ = [ 'base', 'conversation', 'message', 'index', 'app_model_config', 'account', 'document', 'dataset', - 'app', 'completion' + 'app', 'completion', 'audio' ] from . import * diff --git a/api/services/errors/audio.py b/api/services/errors/audio.py new file mode 100644 index 0000000000..0f67fa1b61 --- /dev/null +++ b/api/services/errors/audio.py @@ -0,0 +1,23 @@ +from services.errors.base import BaseServiceError + +class NoAudioUploadedServiceError(BaseServiceError): + error_code = 'no_audio_uploaded' + description = "Please upload your audio." + code = 400 + + +class AudioTooLargeServiceError(BaseServiceError): + error_code = 'audio_too_large' + description = "Audio size exceeded. {message}" + code = 413 + + +class UnsupportedAudioTypeServiceError(BaseServiceError): + error_code = 'unsupported_audio_type' + description = "Audio type not allowed." + code = 415 + +class ProviderNotSupportSpeechToTextServiceError(BaseServiceError): + error_code = 'provider_not_support_speech_to_text' + description = "Provider not support speech to text. {message}" + code = 400 \ No newline at end of file diff --git a/web/app/components/app/chat/index.tsx b/web/app/components/app/chat/index.tsx index 6f1f82c7e5..f67805cbf4 100644 --- a/web/app/components/app/chat/index.tsx +++ b/web/app/components/app/chat/index.tsx @@ -3,6 +3,7 @@ import type { FC } from 'react' import React, { useEffect, useLayoutEffect, useRef, useState } from 'react' import { useContext } from 'use-context-selector' import cn from 'classnames' +import Recorder from 'js-audio-recorder' import { HandThumbDownIcon, HandThumbUpIcon } from '@heroicons/react/24/outline' import { UserCircleIcon } from '@heroicons/react/24/solid' import { useTranslation } from 'react-i18next' @@ -19,6 +20,10 @@ import AppContext from '@/context/app-context' import { Markdown } from '@/app/components/base/markdown' import { formatNumber } from '@/utils/format' import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints' +import VoiceInput from '@/app/components/base/voice-input' +import { Microphone01 } from '@/app/components/base/icons/src/vender/line/mediaAndDevices' +import { Microphone01 as Microphone01Solid } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices' +import { XCircle } from '@/app/components/base/icons/src/vender/solid/general' const stopIcon = ( @@ -59,6 +64,7 @@ export type IChatProps = { controlFocus?: number isShowSuggestion?: boolean suggestionList?: string[] + isShowSpeechToText?: boolean } export type MessageMore = { @@ -421,6 +427,7 @@ const Chat: FC = ({ controlFocus, isShowSuggestion, suggestionList, + isShowSpeechToText, }) => { const { t } = useTranslation() const { notify } = useContext(ToastContext) @@ -488,6 +495,15 @@ const Chat: FC = ({ } }, [suggestionList]) + const [voiceInputShow, setVoiceInputShow] = useState(false) + const handleVoiceInputShow = () => { + (Recorder as any).getPermission().then(() => { + setVoiceInputShow(true) + }, () => { + logError(t('common.voiceInput.notAllow')) + }) + } + return (
{/* Chat List */} @@ -565,6 +581,26 @@ const Chat: FC = ({ />
{query.trim().length}
+ { + query + ? ( +
setQuery('')}> + +
+ ) + : isShowSpeechToText + ? ( +
+ + +
+ ) + : null + } +
{isMobile ? sendBtn : ( @@ -581,6 +617,14 @@ const Chat: FC = ({ )}
+ { + voiceInputShow && ( + setVoiceInputShow(false)} + onConverted={text => setQuery(text)} + /> + ) + }
) diff --git a/web/app/components/app/chat/style.module.css b/web/app/components/app/chat/style.module.css index 9dc5342789..f5e853110f 100644 --- a/web/app/components/app/chat/style.module.css +++ b/web/app/components/app/chat/style.module.css @@ -79,7 +79,7 @@ .textArea { padding-top: 13px; padding-bottom: 13px; - padding-right: 90px; + padding-right: 130px; border-radius: 12px; line-height: 20px; background-color: #fff; diff --git a/web/app/components/app/configuration/config/feature/choose-feature/feature-item/preview-imgs/speech-to-text.svg b/web/app/components/app/configuration/config/feature/choose-feature/feature-item/preview-imgs/speech-to-text.svg new file mode 100644 index 0000000000..029b92fee4 --- /dev/null +++ b/web/app/components/app/configuration/config/feature/choose-feature/feature-item/preview-imgs/speech-to-text.svg @@ -0,0 +1,100 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/web/app/components/app/configuration/config/feature/choose-feature/feature-item/style.module.css b/web/app/components/app/configuration/config/feature/choose-feature/feature-item/style.module.css index c11fc217c8..60c7621d32 100644 --- a/web/app/components/app/configuration/config/feature/choose-feature/feature-item/style.module.css +++ b/web/app/components/app/configuration/config/feature/choose-feature/feature-item/style.module.css @@ -22,4 +22,8 @@ .moreLikeThisPreview { background-image: url(./preview-imgs/more-like-this.svg); +} + +.speechToTextPreview { + background-image: url(./preview-imgs/speech-to-text.svg); } \ No newline at end of file diff --git a/web/app/components/app/configuration/config/feature/choose-feature/index.tsx b/web/app/components/app/configuration/config/feature/choose-feature/index.tsx index ab7badb1a2..27c11ba81b 100644 --- a/web/app/components/app/configuration/config/feature/choose-feature/index.tsx +++ b/web/app/components/app/configuration/config/feature/choose-feature/index.tsx @@ -7,10 +7,12 @@ import MoreLikeThisIcon from '../../../base/icons/more-like-this-icon' import FeatureItem from './feature-item' import Modal from '@/app/components/base/modal' import SuggestedQuestionsAfterAnswerIcon from '@/app/components/app/configuration/base/icons/suggested-questions-after-answer-icon' +import { Microphone01 } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices' type IConfig = { openingStatement: boolean moreLikeThis: boolean suggestedQuestionsAfterAnswer: boolean + speechToText: boolean } export type IChooseFeatureProps = { @@ -19,6 +21,7 @@ export type IChooseFeatureProps = { config: IConfig isChatApp: boolean onChange: (key: string, value: boolean) => void + showSpeechToTextItem?: boolean } const OpeningStatementIcon = ( @@ -33,6 +36,7 @@ const ChooseFeature: FC = ({ isChatApp, config, onChange, + showSpeechToTextItem, }) => { const { t } = useTranslation() @@ -69,6 +73,18 @@ const ChooseFeature: FC = ({ value={config.suggestedQuestionsAfterAnswer} onChange={value => onChange('suggestedQuestionsAfterAnswer', value)} /> + { + showSpeechToTextItem && ( + } + previewImgClassName='speechToTextPreview' + title={t('appDebug.feature.speechToText.title')} + description={t('appDebug.feature.speechToText.description')} + value={config.speechToText} + onChange={value => onChange('speechToText', value)} + /> + ) + } )} diff --git a/web/app/components/app/configuration/config/feature/use-feature.tsx b/web/app/components/app/configuration/config/feature/use-feature.tsx index c88f7c8fe5..48559450ae 100644 --- a/web/app/components/app/configuration/config/feature/use-feature.tsx +++ b/web/app/components/app/configuration/config/feature/use-feature.tsx @@ -7,6 +7,8 @@ function useFeature({ setMoreLikeThis, suggestedQuestionsAfterAnswer, setSuggestedQuestionsAfterAnswer, + speechToText, + setSpeechToText, }: { introduction: string setIntroduction: (introduction: string) => void @@ -14,13 +16,14 @@ function useFeature({ setMoreLikeThis: (moreLikeThis: boolean) => void suggestedQuestionsAfterAnswer: boolean setSuggestedQuestionsAfterAnswer: (suggestedQuestionsAfterAnswer: boolean) => void + speechToText: boolean + setSpeechToText: (speechToText: boolean) => void }) { const [tempshowOpeningStatement, setTempShowOpeningStatement] = React.useState(!!introduction) useEffect(() => { // wait to api data back - if (!!introduction) { + if (introduction) setTempShowOpeningStatement(true) - } }, [introduction]) // const [tempMoreLikeThis, setTempMoreLikeThis] = React.useState(moreLikeThis) @@ -30,15 +33,16 @@ function useFeature({ const featureConfig = { openingStatement: tempshowOpeningStatement, - moreLikeThis: moreLikeThis, - suggestedQuestionsAfterAnswer: suggestedQuestionsAfterAnswer + moreLikeThis, + suggestedQuestionsAfterAnswer, + speechToText, } const handleFeatureChange = (key: string, value: boolean) => { switch (key) { case 'openingStatement': - if (!value) { + if (!value) setIntroduction('') - } + setTempShowOpeningStatement(value) break case 'moreLikeThis': @@ -47,12 +51,14 @@ function useFeature({ case 'suggestedQuestionsAfterAnswer': setSuggestedQuestionsAfterAnswer(value) break + case 'speechToText': + setSpeechToText(value) } } return { featureConfig, - handleFeatureChange + handleFeatureChange, } } -export default useFeature \ No newline at end of file +export default useFeature diff --git a/web/app/components/app/configuration/config/index.tsx b/web/app/components/app/configuration/config/index.tsx index 488d305bea..9615e01716 100644 --- a/web/app/components/app/configuration/config/index.tsx +++ b/web/app/components/app/configuration/config/index.tsx @@ -4,6 +4,7 @@ import React from 'react' import { useContext } from 'use-context-selector' import produce from 'immer' import { useBoolean } from 'ahooks' +import useSWR from 'swr' import DatasetConfig from '../dataset-config' import ChatGroup from '../features/chat-group' import ExperienceEnchanceGroup from '../features/experience-enchance-group' @@ -19,6 +20,7 @@ import ConfigPrompt from '@/app/components/app/configuration/config-prompt' import ConfigVar from '@/app/components/app/configuration/config-var' import type { PromptVariable } from '@/models/debug' import { AppType } from '@/types/app' +import { fetchTenantInfo } from '@/service/common' const Config: FC = () => { const { @@ -33,8 +35,12 @@ const Config: FC = () => { setMoreLikeThisConfig, suggestedQuestionsAfterAnswerConfig, setSuggestedQuestionsAfterAnswerConfig, + speechToTextConfig, + setSpeechToTextConfig, } = useContext(ConfigContext) const isChatApp = mode === AppType.chat + const { data: userInfo } = useSWR({ url: '/info' }, fetchTenantInfo) + const targetProvider = userInfo?.providers?.find(({ token_is_set, is_valid }) => token_is_set && is_valid) const promptTemplate = modelConfig.configs.prompt_template const promptVariables = modelConfig.configs.prompt_variables @@ -78,9 +84,15 @@ const Config: FC = () => { draft.enabled = value })) }, + speechToText: speechToTextConfig.enabled, + setSpeechToText: (value) => { + setSpeechToTextConfig(produce(speechToTextConfig, (draft) => { + draft.enabled = value + })) + }, }) - const hasChatConfig = isChatApp && (featureConfig.openingStatement || featureConfig.suggestedQuestionsAfterAnswer) + const hasChatConfig = isChatApp && (featureConfig.openingStatement || featureConfig.suggestedQuestionsAfterAnswer || (featureConfig.speechToText && targetProvider?.provider_name === 'openai')) const hasToolbox = false const [showAutomatic, { setTrue: showAutomaticTrue, setFalse: showAutomaticFalse }] = useBoolean(false) @@ -110,6 +122,7 @@ const Config: FC = () => { isChatApp={isChatApp} config={featureConfig} onChange={handleFeatureChange} + showSpeechToTextItem={targetProvider?.provider_name === 'openai'} /> )} {showAutomatic && ( @@ -149,6 +162,7 @@ const Config: FC = () => { } } isShowSuggestedQuestionsAfterAnswer={featureConfig.suggestedQuestionsAfterAnswer} + isShowSpeechText={featureConfig.speechToText} /> ) } diff --git a/web/app/components/app/configuration/debug/index.tsx b/web/app/components/app/configuration/debug/index.tsx index 5ecae6ed4e..2b8f11a925 100644 --- a/web/app/components/app/configuration/debug/index.tsx +++ b/web/app/components/app/configuration/debug/index.tsx @@ -38,6 +38,7 @@ const Debug: FC = ({ mode, introduction, suggestedQuestionsAfterAnswerConfig, + speechToTextConfig, moreLikeThisConfig, inputs, // setInputs, @@ -159,6 +160,7 @@ const Debug: FC = ({ enabled: false, }, suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig, + speech_to_text: speechToTextConfig, agent_mode: { enabled: true, tools: [...postDatasets], @@ -308,6 +310,7 @@ const Debug: FC = ({ user_input_form: promptVariablesToUserInputsForm(modelConfig.configs.prompt_variables), opening_statement: introduction, suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig, + speech_to_text: speechToTextConfig, more_like_this: moreLikeThisConfig, agent_mode: { enabled: true, @@ -386,6 +389,7 @@ const Debug: FC = ({ }} isShowSuggestion={doShowSuggestion} suggestionList={suggestQuestions} + isShowSpeechToText={speechToTextConfig.enabled} /> diff --git a/web/app/components/app/configuration/features/chat-group/index.tsx b/web/app/components/app/configuration/features/chat-group/index.tsx index 774137edb1..483ac85706 100644 --- a/web/app/components/app/configuration/features/chat-group/index.tsx +++ b/web/app/components/app/configuration/features/chat-group/index.tsx @@ -1,25 +1,30 @@ 'use client' -import React, { FC } from 'react' -import GroupName from '../../base/group-name' -import OpeningStatement, { IOpeningStatementProps } from './opening-statement' -import SuggestedQuestionsAfterAnswer from './suggested-questions-after-answer' +import type { FC } from 'react' +import React from 'react' import { useTranslation } from 'react-i18next' +import GroupName from '../../base/group-name' +import type { IOpeningStatementProps } from './opening-statement' +import OpeningStatement from './opening-statement' +import SuggestedQuestionsAfterAnswer from './suggested-questions-after-answer' +import SpeechToText from './speech-to-text' /* -* Include +* Include * 1. Conversation Opener * 2. Opening Suggestion * 3. Next question suggestion */ -interface ChatGroupProps { +type ChatGroupProps = { isShowOpeningStatement: boolean openingStatementConfig: IOpeningStatementProps isShowSuggestedQuestionsAfterAnswer: boolean + isShowSpeechText: boolean } const ChatGroup: FC = ({ isShowOpeningStatement, openingStatementConfig, - isShowSuggestedQuestionsAfterAnswer + isShowSuggestedQuestionsAfterAnswer, + isShowSpeechText, }) => { const { t } = useTranslation() @@ -33,6 +38,11 @@ const ChatGroup: FC = ({ {isShowSuggestedQuestionsAfterAnswer && ( )} + { + isShowSpeechText && ( + + ) + } ) diff --git a/web/app/components/app/configuration/features/chat-group/speech-to-text/index.tsx b/web/app/components/app/configuration/features/chat-group/speech-to-text/index.tsx new file mode 100644 index 0000000000..913765bce3 --- /dev/null +++ b/web/app/components/app/configuration/features/chat-group/speech-to-text/index.tsx @@ -0,0 +1,25 @@ +'use client' +import React, { type FC } from 'react' +import { useTranslation } from 'react-i18next' +import Panel from '@/app/components/app/configuration/base/feature-panel' +import { Microphone01 } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices' + +const SuggestedQuestionsAfterAnswer: FC = () => { + const { t } = useTranslation() + + return ( + +
{t('appDebug.feature.speechToText.title')}
+ + } + headerIcon={} + headerRight={ +
{t('appDebug.feature.speechToText.resDes')}
+ } + noBodySpacing + /> + ) +} +export default React.memo(SuggestedQuestionsAfterAnswer) diff --git a/web/app/components/app/configuration/index.tsx b/web/app/components/app/configuration/index.tsx index 34af8d8443..be878ec2b8 100644 --- a/web/app/components/app/configuration/index.tsx +++ b/web/app/components/app/configuration/index.tsx @@ -53,6 +53,9 @@ const Configuration: FC = () => { const [suggestedQuestionsAfterAnswerConfig, setSuggestedQuestionsAfterAnswerConfig] = useState({ enabled: false, }) + const [speechToTextConfig, setSpeechToTextConfig] = useState({ + enabled: false, + }) const [formattingChanged, setFormattingChanged] = useState(false) const [inputs, setInputs] = useState({}) const [query, setQuery] = useState('') @@ -73,6 +76,7 @@ const Configuration: FC = () => { opening_statement: '', more_like_this: null, suggested_questions_after_answer: null, + speech_to_text: null, dataSets: [], }) @@ -102,6 +106,9 @@ const Configuration: FC = () => { setSuggestedQuestionsAfterAnswerConfig(modelConfig.suggested_questions_after_answer || { enabled: false, }) + setSpeechToTextConfig(modelConfig.speech_to_text || { + enabled: false, + }) } const [hasSetCustomAPIKEY, setHasSetCustomerAPIKEY] = useState(true) @@ -146,6 +153,9 @@ const Configuration: FC = () => { if (modelConfig.suggested_questions_after_answer) setSuggestedQuestionsAfterAnswerConfig(modelConfig.suggested_questions_after_answer) + if (modelConfig.speech_to_text) + setSpeechToTextConfig(modelConfig.speech_to_text) + const config = { modelConfig: { provider: model.provider, @@ -157,6 +167,7 @@ const Configuration: FC = () => { opening_statement: modelConfig.opening_statement, more_like_this: modelConfig.more_like_this, suggested_questions_after_answer: modelConfig.suggested_questions_after_answer, + speech_to_text: modelConfig.speech_to_text, dataSets: datasets || [], }, completionParams: model.completion_params, @@ -187,6 +198,7 @@ const Configuration: FC = () => { opening_statement: introduction || '', more_like_this: moreLikeThisConfig, suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig, + speech_to_text: speechToTextConfig, agent_mode: { enabled: true, tools: [...postDatasets], @@ -203,6 +215,7 @@ const Configuration: FC = () => { draft.opening_statement = introduction draft.more_like_this = moreLikeThisConfig draft.suggested_questions_after_answer = suggestedQuestionsAfterAnswerConfig + draft.speech_to_text = speechToTextConfig draft.dataSets = dataSets }) setPublishedConfig({ @@ -245,6 +258,8 @@ const Configuration: FC = () => { setMoreLikeThisConfig, suggestedQuestionsAfterAnswerConfig, setSuggestedQuestionsAfterAnswerConfig, + speechToTextConfig, + setSpeechToTextConfig, formattingChanged, setFormattingChanged, inputs, diff --git a/web/app/components/base/icons/assets/vender/line/general/loading-02.svg b/web/app/components/base/icons/assets/vender/line/general/loading-02.svg new file mode 100644 index 0000000000..c08ddd4bf4 --- /dev/null +++ b/web/app/components/base/icons/assets/vender/line/general/loading-02.svg @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/web/app/components/base/icons/assets/vender/line/general/x-close.svg b/web/app/components/base/icons/assets/vender/line/general/x-close.svg new file mode 100644 index 0000000000..0ba4fd44f3 --- /dev/null +++ b/web/app/components/base/icons/assets/vender/line/general/x-close.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/web/app/components/base/icons/assets/vender/line/mediaAndDevices/microphone-01.svg b/web/app/components/base/icons/assets/vender/line/mediaAndDevices/microphone-01.svg new file mode 100644 index 0000000000..46d6818c8d --- /dev/null +++ b/web/app/components/base/icons/assets/vender/line/mediaAndDevices/microphone-01.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/web/app/components/base/icons/assets/vender/solid/general/x-circle.svg b/web/app/components/base/icons/assets/vender/solid/general/x-circle.svg new file mode 100644 index 0000000000..5acbe5f562 --- /dev/null +++ b/web/app/components/base/icons/assets/vender/solid/general/x-circle.svg @@ -0,0 +1,3 @@ + + + diff --git a/web/app/components/base/icons/assets/vender/solid/mediaAndDevices/microphone-01.svg b/web/app/components/base/icons/assets/vender/solid/mediaAndDevices/microphone-01.svg new file mode 100644 index 0000000000..8727719fba --- /dev/null +++ b/web/app/components/base/icons/assets/vender/solid/mediaAndDevices/microphone-01.svg @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/web/app/components/base/icons/assets/vender/solid/mediaAndDevices/stop-circle.svg b/web/app/components/base/icons/assets/vender/solid/mediaAndDevices/stop-circle.svg new file mode 100644 index 0000000000..3009a52e90 --- /dev/null +++ b/web/app/components/base/icons/assets/vender/solid/mediaAndDevices/stop-circle.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/web/app/components/base/icons/src/vender/line/general/Loading02.json b/web/app/components/base/icons/src/vender/line/general/Loading02.json new file mode 100644 index 0000000000..1711fba5dd --- /dev/null +++ b/web/app/components/base/icons/src/vender/line/general/Loading02.json @@ -0,0 +1,64 @@ +{ + "icon": { + "type": "element", + "isRootNode": true, + "name": "svg", + "attributes": { + "width": "16", + "height": "16", + "viewBox": "0 0 16 16", + "fill": "none", + "xmlns": "http://www.w3.org/2000/svg" + }, + "children": [ + { + "type": "element", + "name": "g", + "attributes": { + "clip-path": "url(#clip0_6037_51601)" + }, + "children": [ + { + "type": "element", + "name": "path", + "attributes": { + "d": "M7.99992 1.33398V4.00065M7.99992 12.0007V14.6673M3.99992 8.00065H1.33325M14.6666 8.00065H11.9999M12.7189 12.7196L10.8333 10.834M12.7189 3.33395L10.8333 5.21956M3.28097 12.7196L5.16659 10.834M3.28097 3.33395L5.16659 5.21956", + "stroke": "currentColor", + "stroke-width": "1.25", + "stroke-linecap": "round", + "stroke-linejoin": "round" + }, + "children": [] + } + ] + }, + { + "type": "element", + "name": "defs", + "attributes": {}, + "children": [ + { + "type": "element", + "name": "clipPath", + "attributes": { + "id": "clip0_6037_51601" + }, + "children": [ + { + "type": "element", + "name": "rect", + "attributes": { + "width": "16", + "height": "16", + "fill": "white" + }, + "children": [] + } + ] + } + ] + } + ] + }, + "name": "Loading02" +} \ No newline at end of file diff --git a/web/app/components/base/icons/src/vender/line/general/Loading02.tsx b/web/app/components/base/icons/src/vender/line/general/Loading02.tsx new file mode 100644 index 0000000000..4441f7f47b --- /dev/null +++ b/web/app/components/base/icons/src/vender/line/general/Loading02.tsx @@ -0,0 +1,14 @@ +// GENERATE BY script +// DON NOT EDIT IT MANUALLY + +import * as React from 'react' +import data from './Loading02.json' +import IconBase from '@/app/components/base/icons/IconBase' +import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase' + +const Icon = React.forwardRef, Omit>(( + props, + ref, +) => ) + +export default Icon diff --git a/web/app/components/base/icons/src/vender/line/general/XClose.json b/web/app/components/base/icons/src/vender/line/general/XClose.json new file mode 100644 index 0000000000..6fef683513 --- /dev/null +++ b/web/app/components/base/icons/src/vender/line/general/XClose.json @@ -0,0 +1,39 @@ +{ + "icon": { + "type": "element", + "isRootNode": true, + "name": "svg", + "attributes": { + "width": "16", + "height": "16", + "viewBox": "0 0 16 16", + "fill": "none", + "xmlns": "http://www.w3.org/2000/svg" + }, + "children": [ + { + "type": "element", + "name": "g", + "attributes": { + "id": "x-close" + }, + "children": [ + { + "type": "element", + "name": "path", + "attributes": { + "id": "Icon", + "d": "M12 4L4 12M4 4L12 12", + "stroke": "currentColor", + "stroke-width": "1.25", + "stroke-linecap": "round", + "stroke-linejoin": "round" + }, + "children": [] + } + ] + } + ] + }, + "name": "XClose" +} \ No newline at end of file diff --git a/web/app/components/base/icons/src/vender/line/general/XClose.tsx b/web/app/components/base/icons/src/vender/line/general/XClose.tsx new file mode 100644 index 0000000000..72ac3105df --- /dev/null +++ b/web/app/components/base/icons/src/vender/line/general/XClose.tsx @@ -0,0 +1,14 @@ +// GENERATE BY script +// DON NOT EDIT IT MANUALLY + +import * as React from 'react' +import data from './XClose.json' +import IconBase from '@/app/components/base/icons/IconBase' +import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase' + +const Icon = React.forwardRef, Omit>(( + props, + ref, +) => ) + +export default Icon diff --git a/web/app/components/base/icons/src/vender/line/general/index.ts b/web/app/components/base/icons/src/vender/line/general/index.ts index 1bd5afa467..cc5e845752 100644 --- a/web/app/components/base/icons/src/vender/line/general/index.ts +++ b/web/app/components/base/icons/src/vender/line/general/index.ts @@ -1,2 +1,4 @@ +export { default as Loading02 } from './Loading02' export { default as Trash03 } from './Trash03' +export { default as XClose } from './XClose' export { default as X } from './X' diff --git a/web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.json b/web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.json new file mode 100644 index 0000000000..8f273d0a75 --- /dev/null +++ b/web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.json @@ -0,0 +1,39 @@ +{ + "icon": { + "type": "element", + "isRootNode": true, + "name": "svg", + "attributes": { + "width": "16", + "height": "16", + "viewBox": "0 0 16 16", + "fill": "none", + "xmlns": "http://www.w3.org/2000/svg" + }, + "children": [ + { + "type": "element", + "name": "g", + "attributes": { + "id": "microphone-01" + }, + "children": [ + { + "type": "element", + "name": "path", + "attributes": { + "id": "Icon", + "d": "M12.6666 6.66732V8.00065C12.6666 10.578 10.5772 12.6673 7.99992 12.6673M3.33325 6.66732V8.00065C3.33325 10.578 5.42259 12.6673 7.99992 12.6673M7.99992 12.6673V14.6673M5.33325 14.6673H10.6666M7.99992 10.0007C6.89535 10.0007 5.99992 9.10522 5.99992 8.00065V3.33398C5.99992 2.22941 6.89535 1.33398 7.99992 1.33398C9.10449 1.33398 9.99992 2.22941 9.99992 3.33398V8.00065C9.99992 9.10522 9.10449 10.0007 7.99992 10.0007Z", + "stroke": "currentColor", + "stroke-width": "1.25", + "stroke-linecap": "round", + "stroke-linejoin": "round" + }, + "children": [] + } + ] + } + ] + }, + "name": "Microphone01" +} \ No newline at end of file diff --git a/web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.tsx b/web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.tsx new file mode 100644 index 0000000000..f7722df092 --- /dev/null +++ b/web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.tsx @@ -0,0 +1,14 @@ +// GENERATE BY script +// DON NOT EDIT IT MANUALLY + +import * as React from 'react' +import data from './Microphone01.json' +import IconBase from '@/app/components/base/icons/IconBase' +import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase' + +const Icon = React.forwardRef, Omit>(( + props, + ref, +) => ) + +export default Icon diff --git a/web/app/components/base/icons/src/vender/line/mediaAndDevices/index.ts b/web/app/components/base/icons/src/vender/line/mediaAndDevices/index.ts new file mode 100644 index 0000000000..6a5d1ff945 --- /dev/null +++ b/web/app/components/base/icons/src/vender/line/mediaAndDevices/index.ts @@ -0,0 +1 @@ +export { default as Microphone01 } from './Microphone01' diff --git a/web/app/components/base/icons/src/vender/solid/general/XCircle.json b/web/app/components/base/icons/src/vender/solid/general/XCircle.json new file mode 100644 index 0000000000..dd269fafcc --- /dev/null +++ b/web/app/components/base/icons/src/vender/solid/general/XCircle.json @@ -0,0 +1,29 @@ +{ + "icon": { + "type": "element", + "isRootNode": true, + "name": "svg", + "attributes": { + "width": "16", + "height": "16", + "viewBox": "0 0 16 16", + "fill": "none", + "xmlns": "http://www.w3.org/2000/svg" + }, + "children": [ + { + "type": "element", + "name": "path", + "attributes": { + "id": "Solid", + "fill-rule": "evenodd", + "clip-rule": "evenodd", + "d": "M8.00008 0.666016C3.94999 0.666016 0.666748 3.94926 0.666748 7.99935C0.666748 12.0494 3.94999 15.3327 8.00008 15.3327C12.0502 15.3327 15.3334 12.0494 15.3334 7.99935C15.3334 3.94926 12.0502 0.666016 8.00008 0.666016ZM10.4715 5.52794C10.7318 5.78829 10.7318 6.2104 10.4715 6.47075L8.94289 7.99935L10.4715 9.52794C10.7318 9.78829 10.7318 10.2104 10.4715 10.4708C10.2111 10.7311 9.78903 10.7311 9.52868 10.4708L8.00008 8.94216L6.47149 10.4708C6.21114 10.7311 5.78903 10.7311 5.52868 10.4708C5.26833 10.2104 5.26833 9.78829 5.52868 9.52794L7.05727 7.99935L5.52868 6.47075C5.26833 6.2104 5.26833 5.78829 5.52868 5.52794C5.78903 5.26759 6.21114 5.26759 6.47149 5.52794L8.00008 7.05654L9.52868 5.52794C9.78903 5.26759 10.2111 5.26759 10.4715 5.52794Z", + "fill": "currentColor" + }, + "children": [] + } + ] + }, + "name": "XCircle" +} \ No newline at end of file diff --git a/web/app/components/base/icons/src/vender/solid/general/XCircle.tsx b/web/app/components/base/icons/src/vender/solid/general/XCircle.tsx new file mode 100644 index 0000000000..bd9ab27be1 --- /dev/null +++ b/web/app/components/base/icons/src/vender/solid/general/XCircle.tsx @@ -0,0 +1,14 @@ +// GENERATE BY script +// DON NOT EDIT IT MANUALLY + +import * as React from 'react' +import data from './XCircle.json' +import IconBase from '@/app/components/base/icons/IconBase' +import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase' + +const Icon = React.forwardRef, Omit>(( + props, + ref, +) => ) + +export default Icon diff --git a/web/app/components/base/icons/src/vender/solid/general/index.ts b/web/app/components/base/icons/src/vender/solid/general/index.ts index d056eef8af..fda0a484e2 100644 --- a/web/app/components/base/icons/src/vender/solid/general/index.ts +++ b/web/app/components/base/icons/src/vender/solid/general/index.ts @@ -1 +1,2 @@ export { default as Download02 } from './Download02' +export { default as XCircle } from './XCircle' diff --git a/web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.json b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.json new file mode 100644 index 0000000000..36aad43649 --- /dev/null +++ b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.json @@ -0,0 +1,55 @@ +{ + "icon": { + "type": "element", + "isRootNode": true, + "name": "svg", + "attributes": { + "width": "16", + "height": "16", + "viewBox": "0 0 16 16", + "fill": "none", + "xmlns": "http://www.w3.org/2000/svg" + }, + "children": [ + { + "type": "element", + "name": "g", + "attributes": { + "id": "microphone-01" + }, + "children": [ + { + "type": "element", + "name": "g", + "attributes": { + "id": "Solid" + }, + "children": [ + { + "type": "element", + "name": "path", + "attributes": { + "fill-rule": "evenodd", + "clip-rule": "evenodd", + "d": "M8.00008 0.666016C6.52732 0.666016 5.33341 1.85992 5.33341 3.33268V7.99935C5.33341 9.47211 6.52732 10.666 8.00008 10.666C9.47284 10.666 10.6667 9.47211 10.6667 7.99935V3.33268C10.6667 1.85992 9.47284 0.666016 8.00008 0.666016Z", + "fill": "currentColor" + }, + "children": [] + }, + { + "type": "element", + "name": "path", + "attributes": { + "d": "M4.00008 6.66602C4.00008 6.29783 3.7016 5.99935 3.33341 5.99935C2.96522 5.99935 2.66675 6.29783 2.66675 6.66602V7.99935C2.66675 10.7195 4.70319 12.9641 7.33466 13.2916C7.33384 13.3052 7.33341 13.3189 7.33341 13.3327V13.9993H5.33341C4.96522 13.9993 4.66675 14.2978 4.66675 14.666C4.66675 15.0342 4.96522 15.3327 5.33341 15.3327H10.6667C11.0349 15.3327 11.3334 15.0342 11.3334 14.666C11.3334 14.2978 11.0349 13.9993 10.6667 13.9993H8.66675V13.3327C8.66675 13.3189 8.66633 13.3052 8.6655 13.2916C11.297 12.9641 13.3334 10.7195 13.3334 7.99935V6.66602C13.3334 6.29783 13.0349 5.99935 12.6667 5.99935C12.2986 5.99935 12.0001 6.29783 12.0001 6.66602V7.99935C12.0001 10.2085 10.2092 11.9993 8.00008 11.9993C5.79094 11.9993 4.00008 10.2085 4.00008 7.99935V6.66602Z", + "fill": "currentColor" + }, + "children": [] + } + ] + } + ] + } + ] + }, + "name": "Microphone01" +} \ No newline at end of file diff --git a/web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.tsx b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.tsx new file mode 100644 index 0000000000..f7722df092 --- /dev/null +++ b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.tsx @@ -0,0 +1,14 @@ +// GENERATE BY script +// DON NOT EDIT IT MANUALLY + +import * as React from 'react' +import data from './Microphone01.json' +import IconBase from '@/app/components/base/icons/IconBase' +import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase' + +const Icon = React.forwardRef, Omit>(( + props, + ref, +) => ) + +export default Icon diff --git a/web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.json b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.json new file mode 100644 index 0000000000..67e02fca63 --- /dev/null +++ b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.json @@ -0,0 +1,38 @@ +{ + "icon": { + "type": "element", + "isRootNode": true, + "name": "svg", + "attributes": { + "width": "20", + "height": "20", + "viewBox": "0 0 20 20", + "fill": "none", + "xmlns": "http://www.w3.org/2000/svg" + }, + "children": [ + { + "type": "element", + "name": "g", + "attributes": { + "id": "stop-circle" + }, + "children": [ + { + "type": "element", + "name": "path", + "attributes": { + "id": "Solid", + "fill-rule": "evenodd", + "clip-rule": "evenodd", + "d": "M9.99992 0.833984C4.93731 0.833984 0.833252 4.93804 0.833252 10.0007C0.833252 15.0633 4.93731 19.1673 9.99992 19.1673C15.0625 19.1673 19.1666 15.0633 19.1666 10.0007C19.1666 4.93804 15.0625 0.833984 9.99992 0.833984ZM6.75741 7.12232C6.66658 7.30058 6.66658 7.53394 6.66658 8.00065V12.0006C6.66658 12.4674 6.66658 12.7007 6.75741 12.879C6.83731 13.0358 6.96479 13.1633 7.12159 13.2432C7.29985 13.334 7.53321 13.334 7.99992 13.334H11.9999C12.4666 13.334 12.7 13.334 12.8782 13.2432C13.035 13.1633 13.1625 13.0358 13.2424 12.879C13.3333 12.7007 13.3333 12.4674 13.3333 12.0006V8.00065C13.3333 7.53394 13.3333 7.30058 13.2424 7.12232C13.1625 6.96552 13.035 6.83804 12.8782 6.75814C12.7 6.66732 12.4666 6.66732 11.9999 6.66732H7.99992C7.53321 6.66732 7.29985 6.66732 7.12159 6.75814C6.96479 6.83804 6.83731 6.96552 6.75741 7.12232Z", + "fill": "currentColor" + }, + "children": [] + } + ] + } + ] + }, + "name": "StopCircle" +} \ No newline at end of file diff --git a/web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.tsx b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.tsx new file mode 100644 index 0000000000..488ae5c5f2 --- /dev/null +++ b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.tsx @@ -0,0 +1,14 @@ +// GENERATE BY script +// DON NOT EDIT IT MANUALLY + +import * as React from 'react' +import data from './StopCircle.json' +import IconBase from '@/app/components/base/icons/IconBase' +import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase' + +const Icon = React.forwardRef, Omit>(( + props, + ref, +) => ) + +export default Icon diff --git a/web/app/components/base/icons/src/vender/solid/mediaAndDevices/index.ts b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/index.ts new file mode 100644 index 0000000000..3fdb556763 --- /dev/null +++ b/web/app/components/base/icons/src/vender/solid/mediaAndDevices/index.ts @@ -0,0 +1,2 @@ +export { default as Microphone01 } from './Microphone01' +export { default as StopCircle } from './StopCircle' diff --git a/web/app/components/base/voice-input/index.module.css b/web/app/components/base/voice-input/index.module.css new file mode 100644 index 0000000000..18d51da525 --- /dev/null +++ b/web/app/components/base/voice-input/index.module.css @@ -0,0 +1,10 @@ +.wrapper { + background: linear-gradient(131deg, #2250F2 0%, #0EBCF3 100%); + box-shadow: 0px 4px 6px -2px rgba(16, 24, 40, 0.03), 0px 12px 16px -4px rgba(16, 24, 40, 0.08); +} + +.convert { + background: linear-gradient(91.92deg, #104AE1 -1.74%, #0098EE 75.74%); + background-clip: text; + color: transparent; +} \ No newline at end of file diff --git a/web/app/components/base/voice-input/index.tsx b/web/app/components/base/voice-input/index.tsx new file mode 100644 index 0000000000..19d9c63ae1 --- /dev/null +++ b/web/app/components/base/voice-input/index.tsx @@ -0,0 +1,197 @@ +import { useCallback, useEffect, useRef, useState } from 'react' +import { useTranslation } from 'react-i18next' +import { useParams, usePathname } from 'next/navigation' +import cn from 'classnames' +import Recorder from 'js-audio-recorder' +import { useRafInterval } from 'ahooks' +import s from './index.module.css' +import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices' +import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general' +import { audioToText } from '@/service/share' + +type VoiceInputTypes = { + onConverted: (text: string) => void + onCancel: () => void +} + +const VoiceInput = ({ + onCancel, + onConverted, +}: VoiceInputTypes) => { + const { t } = useTranslation() + const recorder = useRef(new Recorder()) + const canvasRef = useRef(null) + const ctxRef = useRef(null) + const drawRecordId = useRef(null) + const [originDuration, setOriginDuration] = useState(0) + const [startRecord, setStartRecord] = useState(false) + const [startConvert, setStartConvert] = useState(false) + const pathname = usePathname() + const params = useParams() + const clearInterval = useRafInterval(() => { + setOriginDuration(originDuration + 1) + }, 1000) + + const drawRecord = useCallback(() => { + drawRecordId.current = requestAnimationFrame(drawRecord) + const canvas = canvasRef.current! + const ctx = ctxRef.current! + const dataUnit8Array = recorder.current.getRecordAnalyseData() + const dataArray = [].slice.call(dataUnit8Array) + const lineLength = parseInt(`${canvas.width / 3}`) + const gap = parseInt(`${1024 / lineLength}`) + + ctx.clearRect(0, 0, canvas.width, canvas.height) + ctx.beginPath() + let x = 0 + for (let i = 0; i < lineLength; i++) { + let v = dataArray.slice(i * gap, i * gap + gap).reduce((prev: number, next: number) => { + return prev + next + }, 0) / gap + + if (v < 128) + v = 128 + if (v > 178) + v = 178 + const y = (v - 128) / 50 * canvas.height + + ctx.moveTo(x, 16) + ctx.roundRect(x, 16 - y, 2, y, [1, 1, 0, 0]) + ctx.fill() + x += 3 + } + ctx.closePath() + }, []) + const handleStopRecorder = useCallback(async () => { + clearInterval() + setStartRecord(false) + setStartConvert(true) + recorder.current.stop() + drawRecordId.current && cancelAnimationFrame(drawRecordId.current) + drawRecordId.current = null + const canvas = canvasRef.current! + const ctx = ctxRef.current! + ctx.clearRect(0, 0, canvas.width, canvas.height) + const wavBlob = recorder.current.getWAVBlob() + const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' }) + const formData = new FormData() + formData.append('file', wavFile) + + let url = '' + let isPublic = false + + if (params.token) { + url = '/audio-to-text' + isPublic = true + } + else if (params.appId) { + if (pathname.search('explore/installed') > -1) + url = `/installed-apps/${params.appId}/audio-to-text` + else + url = `/apps/${params.appId}/audio-to-text` + } + + try { + const audioResponse = await audioToText(url, isPublic, formData) + onConverted(audioResponse.text) + onCancel() + } + catch (e) { + onConverted('') + onCancel() + } + }, []) + const handleStartRecord = async () => { + try { + await recorder.current.start() + setStartRecord(true) + setStartConvert(false) + + if (canvasRef.current && ctxRef.current) + drawRecord() + } + catch (e) { + onCancel() + } + } + + const initCanvas = () => { + const dpr = window.devicePixelRatio || 1 + const canvas = document.getElementById('voice-input-record') as HTMLCanvasElement + + if (canvas) { + const { width: cssWidth, height: cssHeight } = canvas.getBoundingClientRect() + + canvas.width = dpr * cssWidth + canvas.height = dpr * cssHeight + canvasRef.current = canvas + + const ctx = canvas.getContext('2d') + if (ctx) { + ctx.scale(dpr, dpr) + ctx.fillStyle = 'rgba(209, 224, 255, 1)' + ctxRef.current = ctx + } + } + } + if (originDuration >= 120 && startRecord) + handleStopRecorder() + + useEffect(() => { + initCanvas() + handleStartRecord() + }, []) + + const minutes = parseInt(`${parseInt(`${originDuration}`) / 60}`) + const seconds = parseInt(`${originDuration}`) % 60 + + return ( +
+
+ + { + startConvert && + } +
+ { + startRecord && ( +
+ {t('common.voiceInput.speaking')} +
+ ) + } + { + startConvert && ( +
+ {t('common.voiceInput.converting')} +
+ ) + } +
+ { + startRecord && ( +
+ +
+ ) + } + { + startConvert && ( +
+ +
+ ) + } +
110 ? 'text-[#F04438]' : 'text-gray-700'}`}>{`0${minutes.toFixed(0)}:${seconds >= 10 ? seconds : `0${seconds}`}`}
+
+
+ ) +} + +export default VoiceInput diff --git a/web/app/components/explore/installed-app/index.tsx b/web/app/components/explore/installed-app/index.tsx index 36460cc223..b9071127ee 100644 --- a/web/app/components/explore/installed-app/index.tsx +++ b/web/app/components/explore/installed-app/index.tsx @@ -29,7 +29,7 @@ const InstalledApp: FC = ({
{installedApp?.app.mode === 'chat' ? ( - + ) : ( diff --git a/web/app/components/share/chat/index.tsx b/web/app/components/share/chat/index.tsx index ebe6ab0967..6b81454772 100644 --- a/web/app/components/share/chat/index.tsx +++ b/web/app/components/share/chat/index.tsx @@ -149,6 +149,7 @@ const Main: FC = ({ } const [suggestedQuestionsAfterAnswerConfig, setSuggestedQuestionsAfterAnswerConfig] = useState(null) + const [speechToTextConfig, setSpeechToTextConfig] = useState(null) const [conversationIdChangeBecauseOfNew, setConversationIdChangeBecauseOfNew, getConversationIdChangeBecauseOfNew] = useGetState(false) const [isChatStarted, { setTrue: setChatStarted, setFalse: setChatNotStarted }] = useBoolean(false) @@ -326,7 +327,7 @@ const Main: FC = ({ const isNotNewConversation = allConversations.some(item => item.id === _conversationId) setAllConversationList(allConversations) // fetch new conversation info - const { user_input_form, opening_statement: introduction, suggested_questions_after_answer }: any = appParams + const { user_input_form, opening_statement: introduction, suggested_questions_after_answer, speech_to_text }: any = appParams const prompt_variables = userInputsFormToPromptVariables(user_input_form) if (siteInfo.default_language) changeLanguage(siteInfo.default_language) @@ -341,6 +342,7 @@ const Main: FC = ({ prompt_variables, } as PromptConfig) setSuggestedQuestionsAfterAnswerConfig(suggested_questions_after_answer) + setSpeechToTextConfig(speech_to_text) // setConversationList(conversations as ConversationItem[]) @@ -620,6 +622,7 @@ const Main: FC = ({ controlFocus={controlFocus} isShowSuggestion={doShowSuggestion} suggestionList={suggestQuestions} + isShowSpeechToText={speechToTextConfig?.enabled} />
) diff --git a/web/context/debug-configuration.ts b/web/context/debug-configuration.ts index 6c238fe4ce..7351d8e284 100644 --- a/web/context/debug-configuration.ts +++ b/web/context/debug-configuration.ts @@ -1,5 +1,5 @@ import { createContext } from 'use-context-selector' -import type { CompletionParams, Inputs, ModelConfig, MoreLikeThisConfig, PromptConfig, SuggestedQuestionsAfterAnswerConfig } from '@/models/debug' +import type { CompletionParams, Inputs, ModelConfig, MoreLikeThisConfig, PromptConfig, SpeechToTextConfig, SuggestedQuestionsAfterAnswerConfig } from '@/models/debug' import type { DataSet } from '@/models/datasets' type IDebugConfiguration = { @@ -19,6 +19,8 @@ type IDebugConfiguration = { setMoreLikeThisConfig: (moreLikeThisConfig: MoreLikeThisConfig) => void suggestedQuestionsAfterAnswerConfig: SuggestedQuestionsAfterAnswerConfig setSuggestedQuestionsAfterAnswerConfig: (suggestedQuestionsAfterAnswerConfig: SuggestedQuestionsAfterAnswerConfig) => void + speechToTextConfig: SpeechToTextConfig + setSpeechToTextConfig: (speechToTextConfig: SpeechToTextConfig) => void formattingChanged: boolean setFormattingChanged: (formattingChanged: boolean) => void inputs: Inputs @@ -59,6 +61,10 @@ const DebugConfigurationContext = createContext({ enabled: false, }, setSuggestedQuestionsAfterAnswerConfig: () => { }, + speechToTextConfig: { + enabled: false, + }, + setSpeechToTextConfig: () => { }, formattingChanged: false, setFormattingChanged: () => { }, inputs: {}, diff --git a/web/i18n/lang/app-debug.en.ts b/web/i18n/lang/app-debug.en.ts index f5df2b390a..6cd509db28 100644 --- a/web/i18n/lang/app-debug.en.ts +++ b/web/i18n/lang/app-debug.en.ts @@ -46,6 +46,11 @@ const translation = { generateNumTip: 'Number of each generated times', tip: 'Using this feature will incur additional tokens overhead', }, + speechToText: { + title: 'Speech to Text', + description: 'Once enabled, you can use voice input.', + resDes: 'Voice input is enabled', + }, dataSet: { title: 'Context', noData: 'You can import datasets as context', diff --git a/web/i18n/lang/app-debug.zh.ts b/web/i18n/lang/app-debug.zh.ts index 7b8f725ae0..1a813c03a0 100644 --- a/web/i18n/lang/app-debug.zh.ts +++ b/web/i18n/lang/app-debug.zh.ts @@ -46,6 +46,11 @@ const translation = { generateNumTip: '每次生成数', tip: '使用此功能将会额外消耗 tokens', }, + speechToText: { + title: '语音转文字', + description: '启用后,您可以使用语音输入。', + resDes: '语音输入已启用', + }, dataSet: { title: '上下文', noData: '您可以导入数据集作为上下文', diff --git a/web/i18n/lang/common.en.ts b/web/i18n/lang/common.en.ts index cdb3fabc97..059aad438c 100644 --- a/web/i18n/lang/common.en.ts +++ b/web/i18n/lang/common.en.ts @@ -225,6 +225,11 @@ const translation = { viewDoc: 'View documentation', relatedApp: 'linked apps', }, + voiceInput: { + speaking: 'Speak now...', + converting: 'Converting to text...', + notAllow: 'microphone not authorized', + }, } export default translation diff --git a/web/i18n/lang/common.zh.ts b/web/i18n/lang/common.zh.ts index a6d17955a4..529f2516c8 100644 --- a/web/i18n/lang/common.zh.ts +++ b/web/i18n/lang/common.zh.ts @@ -226,6 +226,11 @@ const translation = { viewDoc: '查看文档', relatedApp: '个关联应用', }, + voiceInput: { + speaking: '现在讲...', + converting: '正在转换为文本...', + notAllow: '麦克风未授权', + }, } export default translation diff --git a/web/models/debug.ts b/web/models/debug.ts index cfc7f9f803..ef28dcb1ae 100644 --- a/web/models/debug.ts +++ b/web/models/debug.ts @@ -31,6 +31,8 @@ export type MoreLikeThisConfig = { export type SuggestedQuestionsAfterAnswerConfig = MoreLikeThisConfig +export type SpeechToTextConfig = MoreLikeThisConfig + // frontend use. Not the same as backend export type ModelConfig = { provider: string // LLM Provider: for example "OPENAI" @@ -43,6 +45,9 @@ export type ModelConfig = { suggested_questions_after_answer: { enabled: boolean } | null + speech_to_text: { + enabled: boolean + } | null dataSets: any[] } diff --git a/web/package.json b/web/package.json index f0fdf8d665..23ec5b3e4b 100644 --- a/web/package.json +++ b/web/package.json @@ -48,6 +48,7 @@ "i18next": "^22.4.13", "i18next-resources-to-backend": "^1.1.3", "immer": "^9.0.19", + "js-audio-recorder": "^1.0.7", "js-cookie": "^3.0.1", "katex": "^0.16.7", "lodash-es": "^4.17.21", @@ -68,6 +69,7 @@ "react-tooltip": "5.8.3", "react-window": "^1.8.9", "react-window-infinite-loader": "^1.0.9", + "recordrtc": "^5.6.2", "rehype-katex": "^6.0.2", "remark-breaks": "^3.0.2", "remark-gfm": "^3.0.1", @@ -88,6 +90,7 @@ "@types/js-cookie": "^3.0.3", "@types/negotiator": "^0.6.1", "@types/qs": "^6.9.7", + "@types/recordrtc": "^5.6.11", "@types/sortablejs": "^1.15.1", "eslint-config-next": "^13.4.7", "eslint-plugin-react-hooks": "^4.6.0", diff --git a/web/service/base.ts b/web/service/base.ts index a6ea0c9981..dfdfae5331 100644 --- a/web/service/base.ts +++ b/web/service/base.ts @@ -35,7 +35,9 @@ export type IOnError = (msg: string) => void type IOtherOptions = { isPublicAPI?: boolean + bodyStringify?: boolean needAllResponseContent?: boolean + deleteContentType?: boolean onData?: IOnData // for stream onError?: IOnError onCompleted?: IOnCompleted // for stream @@ -132,7 +134,9 @@ const baseFetch = ( fetchOptions: any, { isPublicAPI = false, + bodyStringify = true, needAllResponseContent, + deleteContentType, }: IOtherOptions, ) => { const options = Object.assign({}, baseOptions, fetchOptions) @@ -141,6 +145,15 @@ const baseFetch = ( options.headers.set('Authorization', `bearer ${sharedToken}`) } + if (deleteContentType) { + options.headers.delete('Content-Type') + } + else { + const contentType = options.headers.get('Content-Type') + if (!contentType) + options.headers.set('Content-Type', ContentType.json) + } + const urlPrefix = isPublicAPI ? PUBLIC_API_PREFIX : API_PREFIX let urlWithPrefix = `${urlPrefix}${url.startsWith('/') ? url : `/${url}`}` @@ -160,7 +173,7 @@ const baseFetch = ( delete options.params } - if (body) + if (body && bodyStringify) options.body = JSON.stringify(body) // Handle timeout @@ -285,6 +298,10 @@ export const ssePost = (url: string, fetchOptions: any, { isPublicAPI = false, o signal: abortController.signal, }, fetchOptions) + const contentType = options.headers.get('Content-Type') + if (!contentType) + options.headers.set('Content-Type', ContentType.json) + getAbortController?.(abortController) const urlPrefix = isPublicAPI ? PUBLIC_API_PREFIX : API_PREFIX diff --git a/web/service/share.ts b/web/service/share.ts index 01abfd9a1a..df01abc3eb 100644 --- a/web/service/share.ts +++ b/web/service/share.ts @@ -114,3 +114,7 @@ export const removeMessage = (messageId: string, isInstalledApp: boolean, instal export const fetchSuggestedQuestions = (messageId: string, isInstalledApp: boolean, installedAppId = '') => { return (getAction('get', isInstalledApp))(getUrl(`/messages/${messageId}/suggested-questions`, isInstalledApp, installedAppId)) } + +export const audioToText = (url: string, isPublicAPI: boolean, body: FormData) => { + return (getAction('post', !isPublicAPI))(url, { body }, { bodyStringify: false, deleteContentType: true }) as Promise<{ text: string }> +} diff --git a/web/types/app.ts b/web/types/app.ts index d0d13fb801..79646caf42 100644 --- a/web/types/app.ts +++ b/web/types/app.ts @@ -85,6 +85,9 @@ export type ModelConfig = { suggested_questions_after_answer: { enabled: boolean } + speech_to_text: { + enabled: boolean + } agent_mode: { enabled: boolean tools: ToolItem[]