add support for Tencent Cloud ASR (#2102)

### What problem does this PR solve?

Add support for Tencent Cloud ASR.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2025-07-25 08:44:28 +08:00 · 2024-08-27 11:47:11 +08:00 · 2024-08-27 11:47:11 +08:00 · 2da4e7aa46
commit 2da4e7aa46
parent cf038e099f
13 changed files with 281 additions and 5 deletions
--- a/api/apps/llm_app.py
+++ b/api/apps/llm_app.py
@ -122,6 +122,10 @@ def add_llm():
                        f'"hunyuan_sk": "{req.get("hunyuan_sk", "")}"' + '}'
        req["api_key"] = api_key
        return set_api_key()
+    elif factory == "Tencent Cloud":
+        api_key = '{' + f'"tencent_cloud_sid": "{req.get("tencent_cloud_sid", "")}", ' \
+                f'"tencent_cloud_sk": "{req.get("tencent_cloud_sk", "")}"' + '}'
+        req["api_key"] = api_key
    elif factory == "Bedrock":
        # For Bedrock, due to its special authentication method
        # Assemble bedrock_ak, bedrock_sk, bedrock_region
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@ -3233,6 +3233,13 @@
            "tags": "TTS",
            "status": "1",
            "llm": []
+        },
+        {
+            "name": "Tencent Cloud",
+            "logo": "",
+            "tags": "SPEECH2TEXT",
+            "status": "1",
+            "llm": []
        }
    ]
 }
--- a/rag/llm/__init__.py
+++ b/rag/llm/__init__.py
@ -128,7 +128,8 @@ Seq2txtModel = {
    "Tongyi-Qianwen": QWenSeq2txt,
    "Ollama": OllamaSeq2txt,
    "Azure-OpenAI": AzureSeq2txt,
-    "Xinference": XinferenceSeq2txt
+    "Xinference": XinferenceSeq2txt,
+    "Tencent Cloud": TencentCloudSeq2txt
 }

 TTSModel = {
--- a/rag/llm/sequence2txt_model.py
+++ b/rag/llm/sequence2txt_model.py
@ -22,7 +22,8 @@ from openai import OpenAI
 import os
 import json
 from rag.utils import num_tokens_from_string
-
+import base64
+import re

 class Base(ABC):
    def __init__(self, key, model_name):
@ -35,6 +36,13 @@ class Base(ABC):
            response_format="text"
        )
        return transcription.text.strip(), num_tokens_from_string(transcription.text.strip())
+    
+    def audio2base64(self, audio):
+        """Encode raw audio as a base64 text string.
+
+        Args:
+            audio: Audio content as a bytes-like object (``bytes``,
+                ``bytearray`` or ``memoryview``) or an ``io.BytesIO`` buffer.
+
+        Returns:
+            The base64 encoding of the audio, decoded to ``str``.
+
+        Raises:
+            TypeError: If ``audio`` is none of the supported binary types.
+        """
+        # NOTE(review): relies on a module-level ``import io``; confirm the
+        # module imports it, since only base64/re are added alongside this
+        # method.
+        if isinstance(audio, io.BytesIO):
+            # getvalue() yields the whole buffer regardless of the current
+            # read position.
+            audio = audio.getvalue()
+        if isinstance(audio, (bytes, bytearray, memoryview)):
+            return base64.b64encode(audio).decode("utf-8")
+        raise TypeError("The input audio file should be in binary format.")


 class GPTSeq2txt(Base):
@ -87,3 +95,66 @@ class XinferenceSeq2txt(Base):
    def __init__(self, key, model_name="", base_url=""):
        self.client = OpenAI(api_key="xxx", base_url=base_url)
        self.model_name = model_name
+
+
+class TencentCloudSeq2txt(Base):
+    """Speech-to-text model that calls Tencent Cloud ASR through its Python SDK.
+
+    Recognition is task based: ``transcription`` creates a recognition task
+    and then polls the task status until it succeeds, fails or the retry
+    budget runs out.
+    """
+
+    def __init__(
+        self, key, model_name="16k_zh", base_url="https://asr.tencentcloudapi.com"
+    ):
+        """Build the ASR client from JSON-encoded credentials.
+
+        Args:
+            key: JSON object string holding ``tencent_cloud_sid`` and
+                ``tencent_cloud_sk``; a missing entry falls back to ``""``.
+            model_name: Value sent as ``EngineModelType`` with every task.
+            base_url: Accepted but not read by this constructor.
+        """
+        from tencentcloud.common import credential
+        from tencentcloud.asr.v20190614 import asr_client
+
+        creds = json.loads(key)
+        sid = creds.get("tencent_cloud_sid", "")
+        sk = creds.get("tencent_cloud_sk", "")
+        cred = credential.Credential(sid, sk)
+        # The second argument is passed as an empty string (the region, per
+        # the SDK signature -- TODO confirm an empty region is intended).
+        self.client = asr_client.AsrClient(cred, "")
+        self.model_name = model_name
+
+    def transcription(self, audio, max_retries=60, retry_interval=5):
+        """Transcribe ``audio`` by creating a recognition task and polling it.
+
+        Args:
+            audio: Binary audio accepted by ``audio2base64``.
+            max_retries: Maximum number of status queries before giving up.
+            retry_interval: Seconds to sleep after each query that reports
+                neither success nor failure.
+
+        Returns:
+            ``(text, token_count)`` on success. On failure, timeout or any
+            exception raised while talking to the service, a tuple of a
+            message starting with ``**ERROR**`` and ``0``.
+
+        Raises:
+            TypeError: Propagated from ``audio2base64``, which runs before the
+                error-to-message handling starts.
+        """
+        from tencentcloud.asr.v20190614 import models
+        import time
+
+        b64 = self.audio2base64(audio)
+        try:
+            # Dispatch the recognition task; the audio travels inline in
+            # ``Data`` as base64.
+            create_req = models.CreateRecTaskRequest()
+            params = {
+                "EngineModelType": self.model_name,
+                "ChannelNum": 1,
+                "ResTextFormat": 0,
+                "SourceType": 1,
+                "Data": b64,
+            }
+            create_req.from_json_string(json.dumps(params))
+            task = self.client.CreateRecTask(create_req)
+
+            # Poll the task status until it settles or the budget is spent.
+            status_req = models.DescribeTaskStatusRequest()
+            status_req.from_json_string(json.dumps({"TaskId": task.Data.TaskId}))
+            retries = 0
+            while retries < max_retries:
+                resp = self.client.DescribeTaskStatus(status_req)
+                status = resp.Data.StatusStr
+                if status == "success":
+                    # Drop the bracketed "[m:s.ms,m:s.ms]" markers that
+                    # prefix each segment of the result text.
+                    text = re.sub(
+                        r"\[\d+:\d+\.\d+,\d+:\d+\.\d+\]\s*", "", resp.Data.Result
+                    ).strip()
+                    return text, num_tokens_from_string(text)
+                if status == "failed":
+                    return (
+                        "**ERROR**: Failed to retrieve speech recognition results.",
+                        0,
+                    )
+                time.sleep(retry_interval)
+                retries += 1
+            return "**ERROR**: Max retries exceeded. Task may still be processing.", 0
+
+        except Exception as e:
+            # SDK errors derive from Exception and were reported with the
+            # same message format, so a single handler covers both.
+            return "**ERROR**: " + str(e), 0
--- a/web/src/assets/svg/llm/tencent-cloud.svg
+++ b/web/src/assets/svg/llm/tencent-cloud.svg
@ -0,0 +1 @@
+<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg t="1724663790857" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="4238" xmlns:xlink="http://www.w3.org/1999/xlink" width="200" height="200"><path d="M330.24 820.224l471.04-13.824c16.384-5.12 48.64-17.92 76.8-48.128 11.776-12.288 47.616-52.736 46.08-113.664-0.512-18.432-5.12-65.536-46.08-104.448-33.792-32.256-72.192-40.448-87.04-42.496-58.368-8.192-101.888 19.456-112.64 26.624l-199.68 291.84c-49.664 1.536-98.816 3.584-148.48 5.12l291.84-413.696c21.504-11.776 80.384-39.936 158.72-33.28 104.448 9.216 165.888 74.752 179.2 90.112 38.4 43.52 50.176 88.064 56.32 113.664 13.824 55.808 11.264 143.36-35.84 222.72-10.752 17.408-27.648 46.08-61.44 71.68-44.032 33.792-89.088 43.008-112.64 46.592l-552.96 4.608c25.6-34.304 51.2-69.12 76.8-103.424z" fill="#00A3FF" p-id="4239"></path><path d="M219.648 391.68c-45.056 13.824-90.112 27.136-135.168 40.96 14.848-207.36 180.736-367.104 372.736-373.76 165.376-5.632 322.56 103.936 379.904 271.36h-133.12c-42.496-102.4-144.896-166.4-252.416-158.72-116.736 8.704-214.528 100.352-231.936 220.16z" fill="#026FFF" p-id="4240"></path><path d="M519.68 458.24c-24.576-23.552-75.776-66.56-153.6-87.04-29.696-7.68-95.744-24.576-174.08 0-24.064 7.168-85.504 27.136-133.12 87.04C5.12 525.824 3.072 600.064 2.56 632.32c-0.512 28.672-1.536 129.536 76.8 209.92 71.168 72.704 160.768 80.896 184.32 81.92 24.064-35.84 47.616-71.68 71.68-107.52-17.92 4.096-52.736 8.704-93.696-2.048-17.92-5.12-55.296-15.36-85.504-49.152-39.936-44.032-40.448-96.768-40.96-117.76-0.512-19.456-1.024-72.192 35.84-117.76 47.616-58.88 121.344-60.928 138.24-61.44 96.256-3.072 159.232 65.536 168.96 76.8 20.48-29.184 40.96-57.856 61.44-87.04z" fill="#05C8DB" p-id="4241"></path></svg>
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@ -506,6 +506,7 @@ The above is the content you need to summarize.`,
      vision: 'Does it support Vision?',
      ollamaLink: 'How to integrate {{name}}',
      FishAudioLink: 'How to use FishAudio',
+      TencentCloudLink: 'How to use TencentCloud ASR',
      volcModelNameMessage: 'Please input your model name!',
      addEndpointID: 'EndpointID of the model',
      endpointIDMessage: 'Please input your EndpointID of the model',
@ -529,6 +530,10 @@ The above is the content you need to summarize.`,
      HunyuanSIDMessage: 'Please input your Secret ID',
      addHunyuanSK: 'Hunyuan Secret Key',
      HunyuanSKMessage: 'Please input your Secret Key',
+      addTencentCloudSID: 'TencentCloud Secret ID',
+      TencentCloudSIDMessage: 'Please input your Secret ID',
+      addTencentCloudSK: 'TencentCloud Secret Key',
+      TencentCloudSKMessage: 'Please input your Secret Key',
      SparkModelNameMessage: 'Please select Spark model',
      addSparkAPIPassword: 'Spark APIPassword',
      SparkAPIPasswordMessage: 'please input your APIPassword',
--- a/web/src/locales/zh-traditional.ts
+++ b/web/src/locales/zh-traditional.ts
@ -468,6 +468,7 @@ export default {
      baseUrlNameMessage: '請輸入基礎 Url！',
      ollamaLink: '如何集成 {{name}}',
      FishAudioLink: '如何使用Fish Audio',
+      TencentCloudLink: '如何使用騰訊雲語音識別',
      volcModelNameMessage: '請輸入模型名稱！',
      addEndpointID: '模型 EndpointID',
      endpointIDMessage: '請輸入模型對應的EndpointID',
@ -491,6 +492,10 @@ export default {
      HunyuanSIDMessage: '請輸入 Secret ID',
      addHunyuanSK: '混元 Secret Key',
      HunyuanSKMessage: '請輸入 Secret Key',
+      addTencentCloudSID: '騰訊雲 Secret ID',
+      TencentCloudSIDMessage: '請輸入 Secret ID',
+      addTencentCloudSK: '騰訊雲 Secret Key',
+      TencentCloudSKMessage: '請輸入 Secret Key',
      SparkModelNameMessage: '請選擇星火模型!',
      addSparkAPIPassword: '星火 APIPassword',
      SparkAPIPasswordMessage: '請輸入 APIPassword',
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@ -485,6 +485,7 @@ export default {
      baseUrlNameMessage: '请输入基础 Url！',
      ollamaLink: '如何集成 {{name}}',
      FishAudioLink: '如何使用Fish Audio',
+      TencentCloudLink: '如何使用腾讯云语音识别',
      volcModelNameMessage: '请输入模型名称！',
      addEndpointID: '模型 EndpointID',
      endpointIDMessage: '请输入模型对应的EndpointID',
@ -508,6 +509,10 @@ export default {
      HunyuanSIDMessage: '请输入 Secret ID',
      addHunyuanSK: '混元 Secret Key',
      HunyuanSKMessage: '请输入 Secret Key',
+      addTencentCloudSID: '腾讯云 Secret ID',
+      TencentCloudSIDMessage: '请输入 Secret ID',
+      addTencentCloudSK: '腾讯云 Secret Key',
+      TencentCloudSKMessage: '请输入 Secret Key',
      SparkModelNameMessage: '请选择星火模型！',
      addSparkAPIPassword: '星火 APIPassword',
      SparkAPIPasswordMessage: '请输入 APIPassword',
--- a/web/src/pages/user-setting/setting-model/Tencent-modal/index.tsx
+++ b/web/src/pages/user-setting/setting-model/Tencent-modal/index.tsx
@ -0,0 +1,129 @@
+import { useTranslate } from '@/hooks/common-hooks';
+import { IModalProps } from '@/interfaces/common';
+import { IAddLlmRequestBody } from '@/interfaces/request/llm';
+import { Flex, Form, Input, Modal, Select, Space } from 'antd';
+
+// Field names are sent to the backend verbatim, so they must match the keys
+// read by add_llm in api/apps/llm_app.py (`tencent_cloud_sid`,
+// `tencent_cloud_sk`).
+type FieldType = IAddLlmRequestBody & {
+  tencent_cloud_sid: string;
+  tencent_cloud_sk: string;
+};
+
+const { Option } = Select;
+
+/**
+ * Modal form for adding a Tencent Cloud speech-to-text model.
+ *
+ * Collects the model type, the ASR engine name and the Secret ID / Secret Key
+ * pair, then hands the validated values plus `llm_factory` to `onOk`.
+ */
+const TencentCloudModal = ({
+  visible,
+  hideModal,
+  onOk,
+  loading,
+  llmFactory,
+}: IModalProps<IAddLlmRequestBody> & { llmFactory: string }) => {
+  const [form] = Form.useForm<FieldType>();
+
+  const { t } = useTranslate('setting');
+
+  const handleOk = async () => {
+    const values = await form.validateFields();
+
+    // The payload carries the secret key; do not log it.
+    const data = {
+      ...values,
+      llm_factory: llmFactory,
+    };
+
+    onOk?.(data);
+  };
+
+  return (
+    <Modal
+      title={t('addLlmTitle', { name: llmFactory })}
+      open={visible}
+      onOk={handleOk}
+      onCancel={hideModal}
+      okButtonProps={{ loading }}
+      footer={(originNode: React.ReactNode) => {
+        return (
+          <Flex justify={'space-between'}>
+            <a
+              href={`https://cloud.tencent.com/document/api/1093/37823`}
+              target="_blank"
+              rel="noreferrer"
+            >
+              {t('TencentCloudLink')}
+            </a>
+            <Space>{originNode}</Space>
+          </Flex>
+        );
+      }}
+      confirmLoading={loading}
+    >
+      <Form
+        name="basic"
+        style={{ maxWidth: 600 }}
+        autoComplete="off"
+        layout={'vertical'}
+        form={form}
+      >
+        <Form.Item<FieldType>
+          label={t('modelType')}
+          name="model_type"
+          initialValue={'speech2text'}
+          rules={[{ required: true, message: t('modelTypeMessage') }]}
+        >
+          <Select placeholder={t('modelTypeMessage')}>
+            <Option value="speech2text">speech2text</Option>
+          </Select>
+        </Form.Item>
+        <Form.Item<FieldType>
+          label={t('modelName')}
+          name="llm_name"
+          initialValue={'16k_zh'}
+          rules={[{ required: true, message: t('SparkModelNameMessage') }]}
+        >
+          <Select placeholder={t('modelTypeMessage')}>
+            <Option value="16k_zh">16k_zh</Option>
+            <Option value="16k_zh_large">16k_zh_large</Option>
+            <Option value="16k_multi_lang">16k_multi_lang</Option>
+            <Option value="16k_zh_dialect">16k_zh_dialect</Option>
+            <Option value="16k_en">16k_en</Option>
+            <Option value="16k_yue">16k_yue</Option>
+            <Option value="16k_zh-PY">16k_zh-PY</Option>
+            <Option value="16k_ja">16k_ja</Option>
+            <Option value="16k_ko">16k_ko</Option>
+            <Option value="16k_vi">16k_vi</Option>
+            <Option value="16k_ms">16k_ms</Option>
+            <Option value="16k_id">16k_id</Option>
+            <Option value="16k_fil">16k_fil</Option>
+            <Option value="16k_th">16k_th</Option>
+            <Option value="16k_pt">16k_pt</Option>
+            <Option value="16k_tr">16k_tr</Option>
+            <Option value="16k_ar">16k_ar</Option>
+            <Option value="16k_es">16k_es</Option>
+            <Option value="16k_hi">16k_hi</Option>
+            <Option value="16k_fr">16k_fr</Option>
+            <Option value="16k_zh_medical">16k_zh_medical</Option>
+            <Option value="16k_de">16k_de</Option>
+          </Select>
+        </Form.Item>
+        <Form.Item<FieldType>
+          label={t('addTencentCloudSID')}
+          name="tencent_cloud_sid"
+          rules={[{ required: true, message: t('TencentCloudSIDMessage') }]}
+        >
+          <Input placeholder={t('TencentCloudSIDMessage')} />
+        </Form.Item>
+        <Form.Item<FieldType>
+          label={t('addTencentCloudSK')}
+          name="tencent_cloud_sk"
+          rules={[{ required: true, message: t('TencentCloudSKMessage') }]}
+        >
+          <Input placeholder={t('TencentCloudSKMessage')} />
+        </Form.Item>
+      </Form>
+    </Modal>
+  );
+};
+
+export default TencentCloudModal;
--- a/web/src/pages/user-setting/setting-model/constant.ts
+++ b/web/src/pages/user-setting/setting-model/constant.ts
@ -36,6 +36,7 @@ export const IconMap = {
  'XunFei Spark': 'spark',
  BaiduYiyan: 'yiyan',
  'Fish Audio': 'fish-audio',
+  'Tencent Cloud': 'tencent-cloud',
 };

 export const BedrockRegionList = [
--- a/web/src/pages/user-setting/setting-model/fish-audio-modal/index.tsx
+++ b/web/src/pages/user-setting/setting-model/fish-audio-modal/index.tsx
@ -81,14 +81,14 @@ const FishAudioModal = ({
        </Form.Item>
        <Form.Item<FieldType>
          label={t('addFishAudioAK')}
-          name="FishAudio_ak"
+          name="fish_audio_ak"
          rules={[{ required: true, message: t('FishAudioAKMessage') }]}
        >
          <Input placeholder={t('FishAudioAKMessage')} />
        </Form.Item>
        <Form.Item<FieldType>
          label={t('addFishAudioRefID')}
-          name="FishAudio_refid"
+          name="fish_audio_refid"
          rules={[{ required: false, message: t('FishAudioRefIDMessage') }]}
        >
          <Input placeholder={t('FishAudioRefIDMessage')} />
--- a/web/src/pages/user-setting/setting-model/hooks.ts
+++ b/web/src/pages/user-setting/setting-model/hooks.ts
@ -190,6 +190,33 @@ export const useSubmitHunyuan = () => {
  };
 };

+/**
+ * State and submit handler for the "add Tencent Cloud model" modal.
+ *
+ * Returns the modal visibility flag with its show/hide callbacks, the
+ * loading flag from `useAddLlm`, and an OK handler that submits the payload
+ * through `addLlm` and hides the modal only when `addLlm` resolves to `0`.
+ */
+export const useSubmitTencentCloud = () => {
+  const { addLlm, loading } = useAddLlm();
+  const {
+    visible: TencentCloudAddingVisible,
+    hideModal: hideTencentCloudAddingModal,
+    showModal: showTencentCloudAddingModal,
+  } = useSetModalState();
+
+  const onTencentCloudAddingOk = useCallback(
+    async (payload: IAddLlmRequestBody) => {
+      const ret = await addLlm(payload);
+      // Any other result leaves the modal open.
+      if (ret === 0) {
+        hideTencentCloudAddingModal();
+      }
+    },
+    [hideTencentCloudAddingModal, addLlm],
+  );
+
+  return {
+    TencentCloudAddingLoading: loading,
+    onTencentCloudAddingOk,
+    TencentCloudAddingVisible,
+    hideTencentCloudAddingModal,
+    showTencentCloudAddingModal,
+  };
+};
+
 export const useSubmitSpark = () => {
  const { addLlm, loading } = useAddLlm();
  const {
--- a/web/src/pages/user-setting/setting-model/index.tsx
+++ b/web/src/pages/user-setting/setting-model/index.tsx
@ -27,6 +27,7 @@ import {
 import { useCallback, useMemo } from 'react';
 import SettingTitle from '../components/setting-title';
 import { isLocalLlmFactory } from '../utils';
+import TencentCloudModal from './Tencent-modal';
 import ApiKeyModal from './api-key-modal';
 import BedrockModal from './bedrock-modal';
 import { IconMap } from './constant';
@ -40,6 +41,7 @@ import {
  useSubmitOllama,
  useSubmitSpark,
  useSubmitSystemModelSetting,
+  useSubmitTencentCloud,
  useSubmitVolcEngine,
  useSubmityiyan,
 } from './hooks';
@ -101,7 +103,8 @@ const ModelCard = ({ item, clickApiKey }: IModelCardProps) => {
                item.name === 'Tencent Hunyuan' ||
                item.name === 'XunFei Spark' ||
                item.name === 'BaiduYiyan' ||
-                item.name === 'Fish Audio'
+                item.name === 'Fish Audio' ||
+                item.name === 'Tencent Cloud'
                  ? t('addTheModel')
                  : 'API-Key'}
                <SettingOutlined />
@ -183,6 +186,14 @@ const UserSettingModel = () => {
    HunyuanAddingLoading,
  } = useSubmitHunyuan();

+  const {
+    TencentCloudAddingVisible,
+    hideTencentCloudAddingModal,
+    showTencentCloudAddingModal,
+    onTencentCloudAddingOk,
+    TencentCloudAddingLoading,
+  } = useSubmitTencentCloud();
+
  const {
    SparkAddingVisible,
    hideSparkAddingModal,
@ -223,11 +234,13 @@ const UserSettingModel = () => {
      'XunFei Spark': showSparkAddingModal,
      BaiduYiyan: showyiyanAddingModal,
      'Fish Audio': showFishAudioAddingModal,
+      'Tencent Cloud': showTencentCloudAddingModal,
    }),
    [
      showBedrockAddingModal,
      showVolcAddingModal,
      showHunyuanAddingModal,
+      showTencentCloudAddingModal,
      showSparkAddingModal,
      showyiyanAddingModal,
      showFishAudioAddingModal,
@ -349,6 +362,13 @@ const UserSettingModel = () => {
        loading={HunyuanAddingLoading}
        llmFactory={'Tencent Hunyuan'}
      ></HunyuanModal>
+      {/* llmFactory is sent as llm_factory; it must equal the registered factory name "Tencent Cloud". */}
+      <TencentCloudModal
+        visible={TencentCloudAddingVisible}
+        hideModal={hideTencentCloudAddingModal}
+        onOk={onTencentCloudAddingOk}
+        loading={TencentCloudAddingLoading}
+        llmFactory={'Tencent Cloud'}
+      ></TencentCloudModal>
      <SparkModal
        visible={SparkAddingVisible}
        hideModal={hideSparkAddingModal}
				`@ -0,0 +1 @@`
				<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg t="1724663790857" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="4238" xmlns:xlink="http://www.w3.org/1999/xlink" width="200" height="200"><path d="M330.24 820.224l471.04-13.824c16.384-5.12 48.64-17.92 76.8-48.128 11.776-12.288 47.616-52.736 46.08-113.664-0.512-18.432-5.12-65.536-46.08-104.448-33.792-32.256-72.192-40.448-87.04-42.496-58.368-8.192-101.888 19.456-112.64 26.624l-199.68 291.84c-49.664 1.536-98.816 3.584-148.48 5.12l291.84-413.696c21.504-11.776 80.384-39.936 158.72-33.28 104.448 9.216 165.888 74.752 179.2 90.112 38.4 43.52 50.176 88.064 56.32 113.664 13.824 55.808 11.264 143.36-35.84 222.72-10.752 17.408-27.648 46.08-61.44 71.68-44.032 33.792-89.088 43.008-112.64 46.592l-552.96 4.608c25.6-34.304 51.2-69.12 76.8-103.424z" fill="#00A3FF" p-id="4239"></path><path d="M219.648 391.68c-45.056 13.824-90.112 27.136-135.168 40.96 14.848-207.36 180.736-367.104 372.736-373.76 165.376-5.632 322.56 103.936 379.904 271.36h-133.12c-42.496-102.4-144.896-166.4-252.416-158.72-116.736 8.704-214.528 100.352-231.936 220.16z" fill="#026FFF" p-id="4240"></path><path d="M519.68 458.24c-24.576-23.552-75.776-66.56-153.6-87.04-29.696-7.68-95.744-24.576-174.08 0-24.064 7.168-85.504 27.136-133.12 87.04C5.12 525.824 3.072 600.064 2.56 632.32c-0.512 28.672-1.536 129.536 76.8 209.92 71.168 72.704 160.768 80.896 184.32 81.92 24.064-35.84 47.616-71.68 71.68-107.52-17.92 4.096-52.736 8.704-93.696-2.048-17.92-5.12-55.296-15.36-85.504-49.152-39.936-44.032-40.448-96.768-40.96-117.76-0.512-19.456-1.024-72.192 35.84-117.76 47.616-58.88 121.344-60.928 138.24-61.44 96.256-3.072 159.232 65.536 168.96 76.8 20.48-29.184 40.96-57.856 61.44-87.04z" fill="#05C8DB" p-id="4241"></path></svg>