class JinaTokenizerTool(BuiltinTool):
    """Builtin tool that sends text to the Jina tokenizer HTTP endpoint.

    The request is issued via ``ssrf_proxy.post`` and the decoded JSON
    response is wrapped in a JSON tool message.
    """

    _jina_tokenizer_endpoint = 'https://tokenize.jina.ai/'

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> ToolInvokeMessage:
        """POST the content to the tokenizer endpoint and return its JSON reply.

        Args:
            user_id: Identifier of the invoking user; not used by this method.
            tool_parameters: Must contain ``content``. ``return_chunks``,
                ``return_tokens`` and ``tokenizer`` are optional and are only
                forwarded to the endpoint when they are truthy.

        Returns:
            The message built by ``create_json_message`` from the response JSON.

        Raises:
            KeyError: If ``content`` is missing from ``tool_parameters``.
        """
        payload = {"content": tool_parameters['content']}
        request_headers = {'Content-Type': 'application/json'}

        # The Authorization header is attached only when a non-empty key is
        # present in the runtime credentials.
        credentials = self.runtime.credentials
        api_key = credentials.get('api_key')
        if api_key:
            request_headers['Authorization'] = "Bearer " + api_key

        # Boolean switches are sent only when enabled, always as a literal True.
        for flag in ('return_chunks', 'return_tokens'):
            if tool_parameters.get(flag, False):
                payload[flag] = True

        tokenizer = tool_parameters.get('tokenizer')
        if tokenizer:
            payload['tokenizer'] = tokenizer

        response = ssrf_proxy.post(
            self._jina_tokenizer_endpoint,
            json=payload,
            headers=request_headers,
        )
        return self.create_json_message(response.json())
+parameters: + - name: content + type: string + required: true + label: + en_US: Content + zh_Hans: 内容 + llm_description: the content that needs to be tokenized or segmented + form: llm + - name: return_tokens + type: boolean + required: false + label: + en_US: Return the tokens + zh_Hans: 是否返回tokens + human_description: + en_US: Return the tokens and their corresponding ids in the response. + zh_Hans: 返回tokens及其对应的ids。 + form: form + - name: return_chunks + type: boolean + label: + en_US: Return the chunks + zh_Hans: 是否分块 + human_description: + en_US: Chunk the input into semantically meaningful segments while handling a wide variety of text types and edge cases based on common structural cues. + zh_Hans: 将输入分块为具有语义意义的片段,同时根据常见的结构线索处理各种文本类型和边缘情况。 + form: form + - name: tokenizer + type: select + options: + - value: cl100k_base + label: + en_US: cl100k_base + - value: o200k_base + label: + en_US: o200k_base + - value: p50k_base + label: + en_US: p50k_base + - value: r50k_base + label: + en_US: r50k_base + - value: p50k_edit + label: + en_US: p50k_edit + - value: gpt2 + label: + en_US: gpt2 + label: + en_US: Tokenizer + human_description: + en_US: cl100k_base - gpt-4,gpt-3.5-turbo,gpt-3.5; o200k_base - gpt-4o,gpt-4o-mini; p50k_base - text-davinci-003,text-davinci-002 + form: form