feat: add jina tokenizer tool (#7375)

This commit is contained in:
非法操作 2024-08-19 09:15:46 +08:00 committed by GitHub
parent bfd905602f
commit a0c689c273
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 107 additions and 0 deletions

View File

@@ -0,0 +1,43 @@
from typing import Any
from core.helper import ssrf_proxy
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool
class JinaTokenizerTool(BuiltinTool):
    """Tool that calls the Jina AI tokenizer API to tokenize text or segment it into chunks."""

    _jina_tokenizer_endpoint = 'https://tokenize.jina.ai/'

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> ToolInvokeMessage:
        """Send ``content`` to the Jina tokenizer endpoint and return its JSON response.

        :param user_id: id of the invoking user (unused by this tool).
        :param tool_parameters: must contain ``content``; may contain the
            optional flags ``return_chunks``, ``return_tokens`` and the
            ``tokenizer`` name.
        :return: the API's JSON payload wrapped in a ToolInvokeMessage.
        """
        body: dict[str, Any] = {
            "content": tool_parameters['content'],
        }

        headers = {
            'Content-Type': 'application/json',
        }

        # The API also works anonymously, so the key is optional; a single
        # .get() covers both "key absent" and "key empty" cases.
        if api_key := self.runtime.credentials.get('api_key'):
            headers['Authorization'] = "Bearer " + api_key

        # Optional flags are only included when explicitly enabled, keeping
        # the request payload minimal.
        if tool_parameters.get('return_chunks', False):
            body['return_chunks'] = True
        if tool_parameters.get('return_tokens', False):
            body['return_tokens'] = True
        if tokenizer := tool_parameters.get('tokenizer'):
            body['tokenizer'] = tokenizer

        # ssrf_proxy guards the outbound request against SSRF targets.
        response = ssrf_proxy.post(
            self._jina_tokenizer_endpoint,
            headers=headers,
            json=body,
        )
        return self.create_json_message(response.json())

View File

@@ -0,0 +1,64 @@
identity:
name: jina_tokenizer
author: hjlarry
label:
en_US: JinaTokenizer
description:
human:
en_US: Free API to tokenize text and segment long text into chunks.
zh_Hans: 免费的API可以将文本tokenize也可以将长文本分割成多个部分。
llm: Free API to tokenize text and segment long text into chunks.
parameters:
- name: content
type: string
required: true
label:
en_US: Content
zh_Hans: 内容
llm_description: the content which need to tokenize or segment
form: llm
- name: return_tokens
type: boolean
required: false
label:
en_US: Return the tokens
zh_Hans: 是否返回tokens
human_description:
en_US: Return the tokens and their corresponding ids in the response.
zh_Hans: 返回tokens及其对应的ids。
form: form
- name: return_chunks
type: boolean
required: false
label:
en_US: Return the chunks
zh_Hans: 是否分块
human_description:
en_US: Chunking the input into semantically meaningful segments while handling a wide variety of text types and edge cases based on common structural cues.
zh_Hans: 将输入分块为具有语义意义的片段,同时根据常见的结构线索处理各种文本类型和边缘情况。
form: form
- name: tokenizer
type: select
options:
- value: cl100k_base
label:
en_US: cl100k_base
- value: o200k_base
label:
en_US: o200k_base
- value: p50k_base
label:
en_US: p50k_base
- value: r50k_base
label:
en_US: r50k_base
- value: p50k_edit
label:
en_US: p50k_edit
- value: gpt2
label:
en_US: gpt2
label:
en_US: Tokenizer
zh_Hans: 分词器
human_description:
en_US: cl100k_base - gpt-4,gpt-3.5-turbo,gpt-3.5; o200k_base - gpt-4o,gpt-4o-mini; p50k_base - text-davinci-003,text-davinci-002
form: form