mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-19 05:05:53 +08:00
feat: add jina tokenizer tool (#7375)
This commit is contained in:
parent
bfd905602f
commit
a0c689c273
43
api/core/tools/provider/builtin/jina/tools/jina_tokenizer.py
Normal file
43
api/core/tools/provider/builtin/jina/tools/jina_tokenizer.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from core.helper import ssrf_proxy
|
||||||
|
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||||
|
from core.tools.tool.builtin_tool import BuiltinTool
|
||||||
|
|
||||||
|
|
||||||
|
class JinaTokenizerTool(BuiltinTool):
    """Tool that calls the Jina tokenizer API to tokenize text or segment long text into chunks."""

    # Public Jina segmentation/tokenization endpoint.
    _jina_tokenizer_endpoint = 'https://tokenize.jina.ai/'

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> ToolInvokeMessage:
        """Invoke the Jina tokenizer endpoint and return its JSON response.

        :param user_id: id of the invoking user (required by the tool interface; unused here)
        :param tool_parameters: expects 'content' (required); optional boolean flags
            'return_chunks' and 'return_tokens', and an optional 'tokenizer' name.
        :return: a JSON tool message wrapping the API response body.
        :raises KeyError: if 'content' is missing from tool_parameters.
        """
        content = tool_parameters['content']
        body = {
            "content": content
        }

        headers = {
            'Content-Type': 'application/json'
        }

        # dict.get already yields None for a missing key, so the previous
        # `'api_key' in ... and ...get('api_key')` membership test was redundant
        # and looked the credential up twice; the walrus does a single lookup.
        if api_key := self.runtime.credentials.get('api_key'):
            headers['Authorization'] = "Bearer " + api_key

        if tool_parameters.get('return_chunks', False):
            body['return_chunks'] = True

        if tool_parameters.get('return_tokens', False):
            body['return_tokens'] = True

        if tokenizer := tool_parameters.get('tokenizer'):
            body['tokenizer'] = tokenizer

        # ssrf_proxy wraps the HTTP client with SSRF protections for outbound calls.
        response = ssrf_proxy.post(
            self._jina_tokenizer_endpoint,
            headers=headers,
            json=body,
        )

        return self.create_json_message(response.json())
|
@ -0,0 +1,64 @@
|
|||||||
|
# Tool manifest for the Jina tokenizer builtin tool.
identity:
  name: jina_tokenizer
  author: hjlarry
  label:
    en_US: JinaTokenizer
description:
  human:
    en_US: Free API to tokenize text and segment long text into chunks.
    zh_Hans: 免费的API可以将文本tokenize,也可以将长文本分割成多个部分。
  llm: Free API to tokenize text and segment long text into chunks.
parameters:
  # The text to process; filled by the LLM at invocation time (form: llm).
  - name: content
    type: string
    required: true
    label:
      en_US: Content
      zh_Hans: 内容
    llm_description: the content which need to tokenize or segment
    form: llm
  # Optional flag: include tokens and their ids in the response.
  - name: return_tokens
    type: boolean
    required: false
    label:
      en_US: Return the tokens
      zh_Hans: 是否返回tokens
    human_description:
      en_US: Return the tokens and their corresponding ids in the response.
      zh_Hans: 返回tokens及其对应的ids。
    form: form
  # Optional flag: segment the input into semantically meaningful chunks.
  - name: return_chunks
    type: boolean
    label:
      en_US: Return the chunks
      zh_Hans: 是否分块
    human_description:
      en_US: Chunking the input into semantically meaningful segments while handling a wide variety of text types and edge cases based on common structural cues.
      zh_Hans: 将输入分块为具有语义意义的片段,同时根据常见的结构线索处理各种文本类型和边缘情况。
    form: form
  # Optional tokenizer vocabulary selection (tiktoken encoding names).
  - name: tokenizer
    type: select
    options:
      - value: cl100k_base
        label:
          en_US: cl100k_base
      - value: o200k_base
        label:
          en_US: o200k_base
      - value: p50k_base
        label:
          en_US: p50k_base
      - value: r50k_base
        label:
          en_US: r50k_base
      - value: p50k_edit
        label:
          en_US: p50k_edit
      - value: gpt2
        label:
          en_US: gpt2
    label:
      en_US: Tokenizer
    human_description:
      en_US: cl100k_base - gpt-4,gpt-3.5-turbo,gpt-3.5; o200k_base - gpt-4o,gpt-4o-mini; p50k_base - text-davinci-003,text-davinci-002
    form: form
|
Loading…
x
Reference in New Issue
Block a user