feat: add jina tokenizer tool (#7375)

This commit is contained in:
非法操作 2024-08-19 09:15:46 +08:00 committed by GitHub
parent bfd905602f
commit a0c689c273
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 107 additions and 0 deletions

View File

@@ -0,0 +1,43 @@
from typing import Any
from core.helper import ssrf_proxy
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool
class JinaTokenizerTool(BuiltinTool):
    """Tool that calls the Jina AI tokenizer API to tokenize text or segment it into chunks."""

    _jina_tokenizer_endpoint = 'https://tokenize.jina.ai/'

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> ToolInvokeMessage:
        """Send ``content`` to the Jina tokenizer endpoint and return its JSON response.

        :param user_id: id of the invoking user (unused by this tool).
        :param tool_parameters: must contain ``content``; may contain the
            optional flags ``return_chunks``, ``return_tokens`` and the
            ``tokenizer`` name.
        :return: the API's JSON payload wrapped in a ToolInvokeMessage.
        """
        body: dict[str, Any] = {
            "content": tool_parameters['content'],
        }

        headers = {
            'Content-Type': 'application/json',
        }

        # The API also works anonymously, so the key is optional; a single
        # .get() covers both "key absent" and "key empty" cases.
        if api_key := self.runtime.credentials.get('api_key'):
            headers['Authorization'] = "Bearer " + api_key

        # Optional flags are only included when explicitly enabled, keeping
        # the request payload minimal.
        if tool_parameters.get('return_chunks', False):
            body['return_chunks'] = True
        if tool_parameters.get('return_tokens', False):
            body['return_tokens'] = True
        if tokenizer := tool_parameters.get('tokenizer'):
            body['tokenizer'] = tokenizer

        # ssrf_proxy guards the outbound request against SSRF targets.
        response = ssrf_proxy.post(
            self._jina_tokenizer_endpoint,
            headers=headers,
            json=body,
        )
        return self.create_json_message(response.json())

View File

@@ -0,0 +1,64 @@
identity:
name: jina_tokenizer
author: hjlarry
label:
en_US: JinaTokenizer
description:
human:
en_US: Free API to tokenize text and segment long text into chunks.
zh_Hans: 免费的API可以将文本tokenize也可以将长文本分割成多个部分。
llm: Free API to tokenize text and segment long text into chunks.
parameters:
- name: content
type: string
required: true
label:
en_US: Content
zh_Hans: 内容
llm_description: the content which need to tokenize or segment
form: llm
- name: return_tokens
type: boolean
required: false
label:
en_US: Return the tokens
zh_Hans: 是否返回tokens
human_description:
en_US: Return the tokens and their corresponding ids in the response.
zh_Hans: 返回tokens及其对应的ids。
form: form
- name: return_chunks
type: boolean
required: false
label:
en_US: Return the chunks
zh_Hans: 是否分块
human_description:
en_US: Chunking the input into semantically meaningful segments while handling a wide variety of text types and edge cases based on common structural cues.
zh_Hans: 将输入分块为具有语义意义的片段,同时根据常见的结构线索处理各种文本类型和边缘情况。
form: form
- name: tokenizer
type: select
options:
- value: cl100k_base
label:
en_US: cl100k_base
- value: o200k_base
label:
en_US: o200k_base
- value: p50k_base
label:
en_US: p50k_base
- value: r50k_base
label:
en_US: r50k_base
- value: p50k_edit
label:
en_US: p50k_edit
- value: gpt2
label:
en_US: gpt2
label:
en_US: Tokenizer
zh_Hans: 分词器
human_description:
en_US: cl100k_base - gpt-4,gpt-3.5-turbo,gpt-3.5; o200k_base - gpt-4o,gpt-4o-mini; p50k_base - text-davinci-003,text-davinci-002
form: form