diff --git a/api/core/tools/provider/builtin/podcast_generator/_assets/icon.svg b/api/core/tools/provider/builtin/podcast_generator/_assets/icon.svg new file mode 100644 index 0000000000..01743c9cd3 --- /dev/null +++ b/api/core/tools/provider/builtin/podcast_generator/_assets/icon.svg @@ -0,0 +1,24 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/api/core/tools/provider/builtin/podcast_generator/podcast_generator.py b/api/core/tools/provider/builtin/podcast_generator/podcast_generator.py new file mode 100644 index 0000000000..0b9c025834 --- /dev/null +++ b/api/core/tools/provider/builtin/podcast_generator/podcast_generator.py @@ -0,0 +1,33 @@ +from typing import Any + +import openai + +from core.tools.errors import ToolProviderCredentialValidationError +from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController + + +class PodcastGeneratorProvider(BuiltinToolProviderController): + def _validate_credentials(self, credentials: dict[str, Any]) -> None: + tts_service = credentials.get("tts_service") + api_key = credentials.get("api_key") + + if not tts_service: + raise ToolProviderCredentialValidationError("TTS service is not specified") + + if not api_key: + raise ToolProviderCredentialValidationError("API key is missing") + + if tts_service == "openai": + self._validate_openai_credentials(api_key) + else: + raise ToolProviderCredentialValidationError(f"Unsupported TTS service: {tts_service}") + + def _validate_openai_credentials(self, api_key: str) -> None: + client = openai.OpenAI(api_key=api_key) + try: + # We're using a simple API call to validate the credentials + client.models.list() + except openai.AuthenticationError: + raise ToolProviderCredentialValidationError("Invalid OpenAI API key") + except Exception as e: + raise ToolProviderCredentialValidationError(f"Error validating OpenAI API key: {str(e)}") diff --git a/api/core/tools/provider/builtin/podcast_generator/podcast_generator.yaml b/api/core/tools/provider/builtin/podcast_generator/podcast_generator.yaml new file mode 100644 index 0000000000..bd02b32020 --- /dev/null +++ b/api/core/tools/provider/builtin/podcast_generator/podcast_generator.yaml @@ -0,0 +1,34 @@ +identity: + author: Dify + name: podcast_generator + label: + en_US: Podcast Generator + zh_Hans: 播客生成器 + description: + en_US: Generate podcast audio using Text-to-Speech services + zh_Hans: 使用文字转语音服务生成播客音频 + icon: icon.svg +credentials_for_provider: + tts_service: + type: select + required: true + label: + en_US: TTS Service + zh_Hans: TTS 服务 + placeholder: + en_US: Select a TTS service + zh_Hans: 选择一个 TTS 服务 + options: + - label: + en_US: OpenAI TTS + zh_Hans: OpenAI TTS + value: openai + api_key: + type: secret-input + required: true + label: + en_US: API Key + zh_Hans: API 密钥 + placeholder: + en_US: Enter your TTS service API key + zh_Hans: 输入您的 TTS 服务 API 密钥 diff --git a/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.py b/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.py new file mode 100644 index 0000000000..2d83cc0cb7 --- /dev/null +++ b/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.py @@ -0,0 +1,101 @@ +import concurrent.futures +import random +import struct +from typing import Any, Literal, Optional, Union + +import openai + +from core.tools.entities.tool_entities import ToolInvokeMessage +from core.tools.errors import ToolParameterValidationError, ToolProviderCredentialValidationError +from core.tools.tool.builtin_tool import BuiltinTool + + +class PodcastAudioGeneratorTool(BuiltinTool): + @staticmethod + def _generate_silence(duration): + # Generate silent MP3 data + # This is a simplified version and may not work perfectly with all MP3 players + # For production use, consider using a proper audio library or pre-generated silence MP3 + sample_rate = 44100 + num_samples = int(duration * sample_rate) + silence_data = struct.pack("<" + "h" * num_samples, *([0] * num_samples)) + + # Add a simple MP3 header (this is not a complete MP3 file, but might work for basic needs) + mp3_header = b"\xff\xfb\x90\x04" # A very basic MP3 header + return mp3_header + silence_data + + @staticmethod + def _generate_audio_segment( + client: openai.OpenAI, + line: str, + voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"], + index: int, + ) -> tuple[int, Union[bytes, str], Optional[bytes]]: + try: + response = client.audio.speech.create(model="tts-1", voice=voice, input=line.strip()) + audio = response.content + silence_duration = random.uniform(2, 5) + silence = PodcastAudioGeneratorTool._generate_silence(silence_duration) + return index, audio, silence + except Exception as e: + return index, f"Error generating audio: {str(e)}", None + + def _invoke( + self, user_id: str, tool_parameters: dict[str, Any] + ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]: + # Extract parameters + script = tool_parameters.get("script", "") + host1_voice = tool_parameters.get("host1_voice") + host2_voice = tool_parameters.get("host2_voice") + + # Split the script into lines + script_lines = [line for line in script.split("\n") if line.strip()] + + # Ensure voices are provided + if not host1_voice or not host2_voice: + raise ToolParameterValidationError("Host voices are required") + + # Get OpenAI API key from credentials + if not self.runtime or not self.runtime.credentials: + raise ToolProviderCredentialValidationError("Tool runtime or credentials are missing") + api_key = self.runtime.credentials.get("api_key") + if not api_key: + raise ToolProviderCredentialValidationError("OpenAI API key is missing") + + # Initialize OpenAI client + client = openai.OpenAI(api_key=api_key) + + # Create a thread pool + max_workers = 5 + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [] + for i, line in enumerate(script_lines): + voice = host1_voice if i % 2 == 0 else host2_voice + future = executor.submit(self._generate_audio_segment, client, line, voice, i) + futures.append(future) + + # Collect results + audio_segments: list[Any] = [None] * len(script_lines) + for future in concurrent.futures.as_completed(futures): + index, audio, silence = future.result() + if isinstance(audio, str): # Error occurred + return self.create_text_message(audio) + audio_segments[index] = (audio, silence) + + # Combine audio segments in the correct order + combined_audio = b"" + for i, (audio, silence) in enumerate(audio_segments): + if audio: + combined_audio += audio + if i < len(audio_segments) - 1 and silence: + combined_audio += silence + + # Create a blob message with the combined audio + return [ + self.create_text_message("Audio generated successfully"), + self.create_blob_message( + blob=combined_audio, + meta={"mime_type": "audio/mpeg"}, + save_as=self.VariableKey.AUDIO, + ), + ] diff --git a/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.yaml b/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.yaml new file mode 100644 index 0000000000..d6ae98f595 --- /dev/null +++ b/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.yaml @@ -0,0 +1,95 @@ +identity: + name: podcast_audio_generator + author: Dify + label: + en_US: Podcast Audio Generator + zh_Hans: 播客音频生成器 +description: + human: + en_US: Generate a podcast audio file from a script with two alternating voices using OpenAI's TTS service. + zh_Hans: 使用 OpenAI 的 TTS 服务,从包含两个交替声音的脚本生成播客音频文件。 + llm: This tool converts a prepared podcast script into an audio file using OpenAI's Text-to-Speech service, with two specified voices for alternating hosts. +parameters: + - name: script + type: string + required: true + label: + en_US: Podcast Script + zh_Hans: 播客脚本 + human_description: + en_US: A string containing alternating lines for two hosts, separated by newline characters. + zh_Hans: 包含两位主持人交替台词的字符串,每行用换行符分隔。 + llm_description: A string representing the script, with alternating lines for two hosts separated by newline characters. + form: llm + - name: host1_voice + type: select + required: true + label: + en_US: Host 1 Voice + zh_Hans: 主持人1 音色 + human_description: + en_US: The voice for the first host. + zh_Hans: 第一位主持人的音色。 + llm_description: The voice identifier for the first host's voice. + options: + - label: + en_US: Alloy + zh_Hans: Alloy + value: alloy + - label: + en_US: Echo + zh_Hans: Echo + value: echo + - label: + en_US: Fable + zh_Hans: Fable + value: fable + - label: + en_US: Onyx + zh_Hans: Onyx + value: onyx + - label: + en_US: Nova + zh_Hans: Nova + value: nova + - label: + en_US: Shimmer + zh_Hans: Shimmer + value: shimmer + form: form + - name: host2_voice + type: select + required: true + label: + en_US: Host 2 Voice + zh_Hans: 主持人2 音色 + human_description: + en_US: The voice for the second host. + zh_Hans: 第二位主持人的音色。 + llm_description: The voice identifier for the second host's voice. + options: + - label: + en_US: Alloy + zh_Hans: Alloy + value: alloy + - label: + en_US: Echo + zh_Hans: Echo + value: echo + - label: + en_US: Fable + zh_Hans: Fable + value: fable + - label: + en_US: Onyx + zh_Hans: Onyx + value: onyx + - label: + en_US: Nova + zh_Hans: Nova + value: nova + - label: + en_US: Shimmer + zh_Hans: Shimmer + value: shimmer + form: form