feature. add feat to modify metadata via dataset api (#13116)

This commit is contained in:
aplio 2025-02-02 16:27:12 +09:00 committed by GitHub
parent c2664e0283
commit d73d191f99
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 522 additions and 0 deletions

View File

@ -18,6 +18,7 @@ from controllers.service_api.app.error import (
from controllers.service_api.dataset.error import ( from controllers.service_api.dataset.error import (
ArchivedDocumentImmutableError, ArchivedDocumentImmutableError,
DocumentIndexingError, DocumentIndexingError,
InvalidMetadataError,
) )
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
from core.errors.error import ProviderTokenNotInitError from core.errors.error import ProviderTokenNotInitError
@ -50,6 +51,9 @@ class DocumentAddByTextApi(DatasetApiResource):
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json" "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
) )
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args() args = parser.parse_args()
dataset_id = str(dataset_id) dataset_id = str(dataset_id)
tenant_id = str(tenant_id) tenant_id = str(tenant_id)
@ -61,6 +65,28 @@ class DocumentAddByTextApi(DatasetApiResource):
if not dataset.indexing_technique and not args["indexing_technique"]: if not dataset.indexing_technique and not args["indexing_technique"]:
raise ValueError("indexing_technique is required.") raise ValueError("indexing_technique is required.")
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
text = args.get("text") text = args.get("text")
name = args.get("name") name = args.get("name")
if text is None or name is None: if text is None or name is None:
@ -107,6 +133,8 @@ class DocumentUpdateByTextApi(DatasetApiResource):
"doc_language", type=str, default="English", required=False, nullable=False, location="json" "doc_language", type=str, default="English", required=False, nullable=False, location="json"
) )
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args() args = parser.parse_args()
dataset_id = str(dataset_id) dataset_id = str(dataset_id)
tenant_id = str(tenant_id) tenant_id = str(tenant_id)
@ -115,6 +143,29 @@ class DocumentUpdateByTextApi(DatasetApiResource):
if not dataset: if not dataset:
raise ValueError("Dataset is not exist.") raise ValueError("Dataset is not exist.")
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
if args["text"]: if args["text"]:
text = args.get("text") text = args.get("text")
name = args.get("name") name = args.get("name")
@ -161,6 +212,30 @@ class DocumentAddByFileApi(DatasetApiResource):
args["doc_form"] = "text_model" args["doc_form"] = "text_model"
if "doc_language" not in args: if "doc_language" not in args:
args["doc_language"] = "English" args["doc_language"] = "English"
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
# get dataset info # get dataset info
dataset_id = str(dataset_id) dataset_id = str(dataset_id)
tenant_id = str(tenant_id) tenant_id = str(tenant_id)
@ -228,6 +303,29 @@ class DocumentUpdateByFileApi(DatasetApiResource):
if "doc_language" not in args: if "doc_language" not in args:
args["doc_language"] = "English" args["doc_language"] = "English"
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
# get dataset info # get dataset info
dataset_id = str(dataset_id) dataset_id = str(dataset_id)
tenant_id = str(tenant_id) tenant_id = str(tenant_id)

View File

@ -42,6 +42,7 @@ from models.source import DataSourceOauthBinding
from services.entities.knowledge_entities.knowledge_entities import ( from services.entities.knowledge_entities.knowledge_entities import (
ChildChunkUpdateArgs, ChildChunkUpdateArgs,
KnowledgeConfig, KnowledgeConfig,
MetaDataConfig,
RerankingModel, RerankingModel,
RetrievalModel, RetrievalModel,
SegmentUpdateArgs, SegmentUpdateArgs,
@ -894,6 +895,9 @@ class DocumentService:
document.data_source_info = json.dumps(data_source_info) document.data_source_info = json.dumps(data_source_info)
document.batch = batch document.batch = batch
document.indexing_status = "waiting" document.indexing_status = "waiting"
if knowledge_config.metadata:
    # Carry caller-supplied metadata onto the existing (duplicate) document.
    document.doc_type = knowledge_config.metadata.doc_type
    # Write to doc_metadata, the same attribute the document-building helper
    # in this class populates; assigning to `metadata` would not update it.
    # NOTE(review): `metadata` is presumably reserved by the ORM base class - confirm.
    document.doc_metadata = knowledge_config.metadata.doc_metadata
db.session.add(document) db.session.add(document)
documents.append(document) documents.append(document)
duplicate_document_ids.append(document.id) duplicate_document_ids.append(document.id)
@ -910,6 +914,7 @@ class DocumentService:
account, account,
file_name, file_name,
batch, batch,
knowledge_config.metadata,
) )
db.session.add(document) db.session.add(document)
db.session.flush() db.session.flush()
@ -965,6 +970,7 @@ class DocumentService:
account, account,
page.page_name, page.page_name,
batch, batch,
knowledge_config.metadata,
) )
db.session.add(document) db.session.add(document)
db.session.flush() db.session.flush()
@ -1005,6 +1011,7 @@ class DocumentService:
account, account,
document_name, document_name,
batch, batch,
knowledge_config.metadata,
) )
db.session.add(document) db.session.add(document)
db.session.flush() db.session.flush()
@ -1042,6 +1049,7 @@ class DocumentService:
account: Account, account: Account,
name: str, name: str,
batch: str, batch: str,
metadata: Optional[MetaDataConfig] = None,
): ):
document = Document( document = Document(
tenant_id=dataset.tenant_id, tenant_id=dataset.tenant_id,
@ -1057,6 +1065,9 @@ class DocumentService:
doc_form=document_form, doc_form=document_form,
doc_language=document_language, doc_language=document_language,
) )
if metadata is not None:
document.doc_metadata = metadata.doc_metadata
document.doc_type = metadata.doc_type
return document return document
@staticmethod @staticmethod
@ -1169,6 +1180,10 @@ class DocumentService:
# update document name # update document name
if document_data.name: if document_data.name:
document.name = document_data.name document.name = document_data.name
# update doc_type and doc_metadata if provided
if document_data.metadata is not None:
    # doc_metadata must receive the metadata dictionary; the previous code
    # stored the doc_type string here, discarding the supplied metadata.
    document.doc_metadata = document_data.metadata.doc_metadata
    document.doc_type = document_data.metadata.doc_type
# update document to be waiting # update document to be waiting
document.indexing_status = "waiting" document.indexing_status = "waiting"
document.completed_at = None document.completed_at = None

View File

@ -93,6 +93,11 @@ class RetrievalModel(BaseModel):
score_threshold: Optional[float] = None score_threshold: Optional[float] = None
# Pairs a document type with its metadata dictionary; KnowledgeConfig exposes
# it through its optional `metadata` field.
class MetaDataConfig(BaseModel):
# Document type identifier (a string).
doc_type: str
# Metadata dictionary that accompanies doc_type.
doc_metadata: dict
class KnowledgeConfig(BaseModel): class KnowledgeConfig(BaseModel):
original_document_id: Optional[str] = None original_document_id: Optional[str] = None
duplicate: bool = True duplicate: bool = True
@ -105,6 +110,7 @@ class KnowledgeConfig(BaseModel):
embedding_model: Optional[str] = None embedding_model: Optional[str] = None
embedding_model_provider: Optional[str] = None embedding_model_provider: Optional[str] = None
name: Optional[str] = None name: Optional[str] = None
metadata: Optional[MetaDataConfig] = None
class SegmentUpdateArgs(BaseModel): class SegmentUpdateArgs(BaseModel):

View File

@ -47,6 +47,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<Property name='text' type='string' key='text'> <Property name='text' type='string' key='text'>
Document content Document content
</Property> </Property>
<Property name='doc_type' type='string' key='doc_type'>
Type of document (optional):
- <code>book</code> Book
- <code>web_page</code> Web page
- <code>paper</code> Academic paper/article
- <code>social_media_post</code> Social media post
- <code>wikipedia_entry</code> Wikipedia entry
- <code>personal_document</code> Personal document
- <code>business_document</code> Business document
- <code>im_chat_log</code> Chat log
- <code>synced_from_notion</code> Notion document
- <code>synced_from_github</code> GitHub document
- <code>others</code> Other document types
</Property>
<Property name='doc_metadata' type='object' key='doc_metadata'>
Document metadata (required if doc_type is provided). Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
- <code>language</code> Book language
- <code>author</code> Book author
- <code>publisher</code> Publisher name
- <code>publication_date</code> Publication date
- <code>isbn</code> ISBN number
- <code>category</code> Book category
For <code>web_page</code>:
- <code>title</code> Page title
- <code>url</code> Page URL
- <code>language</code> Page language
- <code>publish_date</code> Publish date
- <code>author/publisher</code> Author or publisher
- <code>topic/keywords</code> Topic or keywords
- <code>description</code> Page description
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property>
<Property name='indexing_technique' type='string' key='indexing_technique'> <Property name='indexing_technique' type='string' key='indexing_technique'>
Index mode Index mode
- <code>high_quality</code> High quality: embedding using embedding model, built as vector database index - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
@ -195,6 +233,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>hierarchical_model</code> Parent-child mode - <code>hierarchical_model</code> Parent-child mode
- <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
- <code>doc_type</code> Type of document (optional)
- <code>book</code> Book
Document records a book or publication
- <code>web_page</code> Web page
Document records web page content
- <code>paper</code> Academic paper/article
Document records academic paper or research article
- <code>social_media_post</code> Social media post
Content from social media posts
- <code>wikipedia_entry</code> Wikipedia entry
Content from Wikipedia entries
- <code>personal_document</code> Personal document
Documents related to personal content
- <code>business_document</code> Business document
Documents related to business content
- <code>im_chat_log</code> Chat log
Records of instant messaging chats
- <code>synced_from_notion</code> Notion document
Documents synchronized from Notion
- <code>synced_from_github</code> GitHub document
Documents synchronized from GitHub
- <code>others</code> Other document types
Other document types not listed above
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
Title of the book
- <code>language</code> Book language
Language of the book
- <code>author</code> Book author
Author of the book
- <code>publisher</code> Publisher name
Name of the publishing house
- <code>publication_date</code> Publication date
Date when the book was published
- <code>isbn</code> ISBN number
International Standard Book Number
- <code>category</code> Book category
Category or genre of the book
For <code>web_page</code>:
- <code>title</code> Page title
Title of the web page
- <code>url</code> Page URL
URL address of the web page
- <code>language</code> Page language
Language of the web page
- <code>publish_date</code> Publish date
Date when the web page was published
- <code>author/publisher</code> Author or publisher
Author or publisher of the web page
- <code>topic/keywords</code> Topic or keywords
Topics or keywords of the web page
- <code>description</code> Page description
Description of the web page content
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
- <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code> - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
- <code>process_rule</code> Processing rules - <code>process_rule</code> Processing rules
@ -307,6 +407,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<Property name='description' type='string' key='description'> <Property name='description' type='string' key='description'>
Knowledge description (optional) Knowledge description (optional)
</Property> </Property>
<Property name='doc_type' type='string' key='doc_type'>
Type of document (optional):
- <code>book</code> Book
- <code>web_page</code> Web page
- <code>paper</code> Academic paper/article
- <code>social_media_post</code> Social media post
- <code>wikipedia_entry</code> Wikipedia entry
- <code>personal_document</code> Personal document
- <code>business_document</code> Business document
- <code>im_chat_log</code> Chat log
- <code>synced_from_notion</code> Notion document
- <code>synced_from_github</code> GitHub document
- <code>others</code> Other document types
</Property>
<Property name='doc_metadata' type='object' key='doc_metadata'>
Document metadata (required if doc_type is provided). Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
- <code>language</code> Book language
- <code>author</code> Book author
- <code>publisher</code> Publisher name
- <code>publication_date</code> Publication date
- <code>isbn</code> ISBN number
- <code>category</code> Book category
For <code>web_page</code>:
- <code>title</code> Page title
- <code>url</code> Page URL
- <code>language</code> Page language
- <code>publish_date</code> Publish date
- <code>author/publisher</code> Author or publisher
- <code>topic/keywords</code> Topic or keywords
- <code>description</code> Page description
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property>
<Property name='indexing_technique' type='string' key='indexing_technique'> <Property name='indexing_technique' type='string' key='indexing_technique'>
Index technique (optional) Index technique (optional)
- <code>high_quality</code> High quality - <code>high_quality</code> High quality
@ -624,6 +762,67 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code> - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
- <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
- <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional) - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
- <code>doc_type</code> Type of document (optional)
- <code>book</code> Book
Document records a book or publication
- <code>web_page</code> Web page
Document records web page content
- <code>paper</code> Academic paper/article
Document records academic paper or research article
- <code>social_media_post</code> Social media post
Content from social media posts
- <code>wikipedia_entry</code> Wikipedia entry
Content from Wikipedia entries
- <code>personal_document</code> Personal document
Documents related to personal content
- <code>business_document</code> Business document
Documents related to business content
- <code>im_chat_log</code> Chat log
Records of instant messaging chats
- <code>synced_from_notion</code> Notion document
Documents synchronized from Notion
- <code>synced_from_github</code> GitHub document
Documents synchronized from GitHub
- <code>others</code> Other document types
Other document types not listed above
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
Title of the book
- <code>language</code> Book language
Language of the book
- <code>author</code> Book author
Author of the book
- <code>publisher</code> Publisher name
Name of the publishing house
- <code>publication_date</code> Publication date
Date when the book was published
- <code>isbn</code> ISBN number
International Standard Book Number
- <code>category</code> Book category
Category or genre of the book
For <code>web_page</code>:
- <code>title</code> Page title
Title of the web page
- <code>url</code> Page URL
URL address of the web page
- <code>language</code> Page language
Language of the web page
- <code>publish_date</code> Publish date
Date when the web page was published
- <code>author/publisher</code> Author or publisher
Author or publisher of the web page
- <code>topic/keywords</code> Topic or keywords
Topics or keywords of the web page
- <code>description</code> Page description
Description of the web page content
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property> </Property>
</Properties> </Properties>
</Col> </Col>

View File

@ -47,6 +47,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<Property name='text' type='string' key='text'> <Property name='text' type='string' key='text'>
文档内容 文档内容
</Property> </Property>
<Property name='doc_type' type='string' key='doc_type'>
文档类型(选填)
- <code>book</code> 图书 Book
- <code>web_page</code> 网页 Web page
- <code>paper</code> 学术论文/文章 Academic paper/article
- <code>social_media_post</code> 社交媒体帖子 Social media post
- <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
- <code>personal_document</code> 个人文档 Personal document
- <code>business_document</code> 商业文档 Business document
- <code>im_chat_log</code> 即时通讯记录 Chat log
- <code>synced_from_notion</code> Notion同步文档 Notion document
- <code>synced_from_github</code> GitHub同步文档 GitHub document
- <code>others</code> 其他文档类型 Other document types
</Property>
<Property name='doc_metadata' type='object' key='doc_metadata'>
文档元数据(如提供文档类型则必填)。字段因文档类型而异:
针对图书 For <code>book</code>:
- <code>title</code> 书名 Book title
- <code>language</code> 图书语言 Book language
- <code>author</code> 作者 Book author
- <code>publisher</code> 出版社 Publisher name
- <code>publication_date</code> 出版日期 Publication date
- <code>isbn</code> ISBN号码 ISBN number
- <code>category</code> 图书分类 Book category
针对网页 For <code>web_page</code>:
- <code>title</code> 页面标题 Page title
- <code>url</code> 页面网址 Page URL
- <code>language</code> 页面语言 Page language
- <code>publish_date</code> 发布日期 Publish date
- <code>author/publisher</code> 作者/发布者 Author or publisher
- <code>topic/keywords</code> 主题/关键词 Topic or keywords
- <code>description</code> 页面描述 Page description
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
针对"其他"类型文档接受任何有效的JSON对象
</Property>
<Property name='indexing_technique' type='string' key='indexing_technique'> <Property name='indexing_technique' type='string' key='indexing_technique'>
索引方式 索引方式
- <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 - <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
@ -194,6 +234,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>text_model</code> text 文档直接 embedding经济模式默认为该模式 - <code>text_model</code> text 文档直接 embedding经济模式默认为该模式
- <code>hierarchical_model</code> parent-child 模式 - <code>hierarchical_model</code> parent-child 模式
- <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding - <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
- <code>doc_type</code> 文档类型(选填)Type of document (optional)
- <code>book</code> 图书
文档记录一本书籍或出版物
- <code>web_page</code> 网页
网页内容的文档记录
- <code>paper</code> 学术论文/文章
学术论文或研究文章的记录
- <code>social_media_post</code> 社交媒体帖子
社交媒体上的帖子内容
- <code>wikipedia_entry</code> 维基百科条目
维基百科的词条内容
- <code>personal_document</code> 个人文档
个人相关的文档记录
- <code>business_document</code> 商业文档
商业相关的文档记录
- <code>im_chat_log</code> 即时通讯记录
即时通讯的聊天记录
- <code>synced_from_notion</code> Notion同步文档
从Notion同步的文档内容
- <code>synced_from_github</code> GitHub同步文档
从GitHub同步的文档内容
- <code>others</code> 其他文档类型
其他未列出的文档类型
- <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
字段因文档类型而异
针对图书类型 For <code>book</code>:
- <code>title</code> 书名
书籍的标题
- <code>language</code> 图书语言
书籍的语言
- <code>author</code> 作者
书籍的作者
- <code>publisher</code> 出版社
出版社的名称
- <code>publication_date</code> 出版日期
书籍的出版日期
- <code>isbn</code> ISBN号码
书籍的ISBN编号
- <code>category</code> 图书分类
书籍的分类类别
针对网页类型 For <code>web_page</code>:
- <code>title</code> 页面标题
网页的标题
- <code>url</code> 页面网址
网页的URL地址
- <code>language</code> 页面语言
网页的语言
- <code>publish_date</code> 发布日期
网页的发布日期
- <code>author/publisher</code> 作者/发布者
网页的作者或发布者
- <code>topic/keywords</code> 主题/关键词
网页的主题或关键词
- <code>description</code> 页面描述
网页的描述信息
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
针对"其他"类型文档接受任何有效的JSON对象
- <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code> - <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>
@ -504,6 +606,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<Property name='text' type='string' key='text'> <Property name='text' type='string' key='text'>
文档内容(选填) 文档内容(选填)
</Property> </Property>
<Property name='doc_type' type='string' key='doc_type'>
文档类型(选填)
- <code>book</code> 图书 Book
- <code>web_page</code> 网页 Web page
- <code>paper</code> 学术论文/文章 Academic paper/article
- <code>social_media_post</code> 社交媒体帖子 Social media post
- <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
- <code>personal_document</code> 个人文档 Personal document
- <code>business_document</code> 商业文档 Business document
- <code>im_chat_log</code> 即时通讯记录 Chat log
- <code>synced_from_notion</code> Notion同步文档 Notion document
- <code>synced_from_github</code> GitHub同步文档 GitHub document
- <code>others</code> 其他文档类型 Other document types
</Property>
<Property name='doc_metadata' type='object' key='doc_metadata'>
文档元数据(如提供文档类型则必填)。字段因文档类型而异:
针对图书 For <code>book</code>:
- <code>title</code> 书名 Book title
- <code>language</code> 图书语言 Book language
- <code>author</code> 作者 Book author
- <code>publisher</code> 出版社 Publisher name
- <code>publication_date</code> 出版日期 Publication date
- <code>isbn</code> ISBN号码 ISBN number
- <code>category</code> 图书分类 Book category
针对网页 For <code>web_page</code>:
- <code>title</code> 页面标题 Page title
- <code>url</code> 页面网址 Page URL
- <code>language</code> 页面语言 Page language
- <code>publish_date</code> 发布日期 Publish date
- <code>author/publisher</code> 作者/发布者 Author or publisher
- <code>topic/keywords</code> 主题/关键词 Topic or keywords
- <code>description</code> 页面描述 Page description
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
针对"其他"类型文档接受任何有效的JSON对象
</Property>
<Property name='process_rule' type='object' key='process_rule'> <Property name='process_rule' type='object' key='process_rule'>
处理规则(选填) 处理规则(选填)
- <code>mode</code> (string) 清洗、分段模式 automatic 自动 / custom 自定义 - <code>mode</code> (string) 清洗、分段模式 automatic 自动 / custom 自定义
@ -624,6 +766,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code> - <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code>
- <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度 - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
- <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填) - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
- <code>doc_type</code> 文档类型(选填)Type of document (optional)
- <code>book</code> 图书
文档记录一本书籍或出版物
- <code>web_page</code> 网页
网页内容的文档记录
- <code>paper</code> 学术论文/文章
学术论文或研究文章的记录
- <code>social_media_post</code> 社交媒体帖子
社交媒体上的帖子内容
- <code>wikipedia_entry</code> 维基百科条目
维基百科的词条内容
- <code>personal_document</code> 个人文档
个人相关的文档记录
- <code>business_document</code> 商业文档
商业相关的文档记录
- <code>im_chat_log</code> 即时通讯记录
即时通讯的聊天记录
- <code>synced_from_notion</code> Notion同步文档
从Notion同步的文档内容
- <code>synced_from_github</code> GitHub同步文档
从GitHub同步的文档内容
- <code>others</code> 其他文档类型
其他未列出的文档类型
- <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
字段因文档类型而异
针对图书类型 For <code>book</code>:
- <code>title</code> 书名
书籍的标题
- <code>language</code> 图书语言
书籍的语言
- <code>author</code> 作者
书籍的作者
- <code>publisher</code> 出版社
出版社的名称
- <code>publication_date</code> 出版日期
书籍的出版日期
- <code>isbn</code> ISBN号码
书籍的ISBN编号
- <code>category</code> 图书分类
书籍的分类类别
针对网页类型 For <code>web_page</code>:
- <code>title</code> 页面标题
网页的标题
- <code>url</code> 页面网址
网页的URL地址
- <code>language</code> 页面语言
网页的语言
- <code>publish_date</code> 发布日期
网页的发布日期
- <code>author/publisher</code> 作者/发布者
网页的作者或发布者
- <code>topic/keywords</code> 主题/关键词
网页的主题或关键词
- <code>description</code> 页面描述
网页的描述信息
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
针对"其他"类型文档接受任何有效的JSON对象
</Property> </Property>
</Properties> </Properties>
</Col> </Col>