feature. add feat to modify metadata via dataset api (#13116)

2025-08-12 05:28:59 +08:00 · 2025-02-02 16:27:12 +09:00 · 2025-02-02 16:27:12 +09:00 · d73d191f99
commit d73d191f99
parent c2664e0283
5 changed files with 522 additions and 0 deletions
--- a/api/controllers/service_api/dataset/document.py
+++ b/api/controllers/service_api/dataset/document.py
@ -18,6 +18,7 @@ from controllers.service_api.app.error import (
 from controllers.service_api.dataset.error import (
    ArchivedDocumentImmutableError,
    DocumentIndexingError,
    InvalidMetadataError,
 )
 from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
 from core.errors.error import ProviderTokenNotInitError
@ -50,6 +51,9 @@ class DocumentAddByTextApi(DatasetApiResource):
            "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
        parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        tenant_id = str(tenant_id)
@ -61,6 +65,28 @@ class DocumentAddByTextApi(DatasetApiResource):
        if not dataset.indexing_technique and not args["indexing_technique"]:
            raise ValueError("indexing_technique is required.")
        # Validate metadata if provided
        if args.get("doc_type") or args.get("doc_metadata"):
            if not args.get("doc_type") or not args.get("doc_metadata"):
                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
                raise InvalidMetadataError(
                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
                )
            if not isinstance(args["doc_metadata"], dict):
                raise InvalidMetadataError("doc_metadata must be a dictionary")
            # Validate metadata schema based on doc_type
            if args["doc_type"] != "others":
                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
                for key, value in args["doc_metadata"].items():
                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
            # set to MetaDataConfig
            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
        text = args.get("text")
        name = args.get("name")
        if text is None or name is None:
@ -107,6 +133,8 @@ class DocumentUpdateByTextApi(DatasetApiResource):
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
        parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        tenant_id = str(tenant_id)
@ -115,6 +143,29 @@ class DocumentUpdateByTextApi(DatasetApiResource):
        if not dataset:
            raise ValueError("Dataset is not exist.")
        # Validate metadata if provided
        if args.get("doc_type") or args.get("doc_metadata"):
            if not args.get("doc_type") or not args.get("doc_metadata"):
                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
                raise InvalidMetadataError(
                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
                )
            if not isinstance(args["doc_metadata"], dict):
                raise InvalidMetadataError("doc_metadata must be a dictionary")
            # Validate metadata schema based on doc_type
            if args["doc_type"] != "others":
                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
                for key, value in args["doc_metadata"].items():
                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
            # set to MetaDataConfig
            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
        if args["text"]:
            text = args.get("text")
            name = args.get("name")
@ -161,6 +212,30 @@ class DocumentAddByFileApi(DatasetApiResource):
            args["doc_form"] = "text_model"
        if "doc_language" not in args:
            args["doc_language"] = "English"
        # Validate metadata if provided
        if args.get("doc_type") or args.get("doc_metadata"):
            if not args.get("doc_type") or not args.get("doc_metadata"):
                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
                raise InvalidMetadataError(
                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
                )
            if not isinstance(args["doc_metadata"], dict):
                raise InvalidMetadataError("doc_metadata must be a dictionary")
            # Validate metadata schema based on doc_type
            if args["doc_type"] != "others":
                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
                for key, value in args["doc_metadata"].items():
                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
            # set to MetaDataConfig
            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
        # get dataset info
        dataset_id = str(dataset_id)
        tenant_id = str(tenant_id)
@ -228,6 +303,29 @@ class DocumentUpdateByFileApi(DatasetApiResource):
        if "doc_language" not in args:
            args["doc_language"] = "English"
        # Validate metadata if provided
        if args.get("doc_type") or args.get("doc_metadata"):
            if not args.get("doc_type") or not args.get("doc_metadata"):
                raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
            if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
                raise InvalidMetadataError(
                    "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
                )
            if not isinstance(args["doc_metadata"], dict):
                raise InvalidMetadataError("doc_metadata must be a dictionary")
            # Validate metadata schema based on doc_type
            if args["doc_type"] != "others":
                metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
                for key, value in args["doc_metadata"].items():
                    if key in metadata_schema and not isinstance(value, metadata_schema[key]):
                        raise InvalidMetadataError(f"Invalid type for metadata field {key}")
            # set to MetaDataConfig
            args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
        # get dataset info
        dataset_id = str(dataset_id)
        tenant_id = str(tenant_id)
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@ -42,6 +42,7 @@ from models.source import DataSourceOauthBinding
 from services.entities.knowledge_entities.knowledge_entities import (
    ChildChunkUpdateArgs,
    KnowledgeConfig,
    MetaDataConfig,
    RerankingModel,
    RetrievalModel,
    SegmentUpdateArgs,
@ -894,6 +895,9 @@ class DocumentService:
                                document.data_source_info = json.dumps(data_source_info)
                                document.batch = batch
                                document.indexing_status = "waiting"
                                if knowledge_config.metadata:
                                    document.doc_type = knowledge_config.metadata.doc_type
                                    # Store the dict on `doc_metadata`, the same attribute the
                                    # new-document path sets; `document.metadata` is not the
                                    # document-metadata field.
                                    document.doc_metadata = knowledge_config.metadata.doc_metadata
                                db.session.add(document)
                                documents.append(document)
                                duplicate_document_ids.append(document.id)
@ -910,6 +914,7 @@ class DocumentService:
                            account,
                            file_name,
                            batch,
                            knowledge_config.metadata,
                        )
                        db.session.add(document)
                        db.session.flush()
@ -965,6 +970,7 @@ class DocumentService:
                                    account,
                                    page.page_name,
                                    batch,
                                    knowledge_config.metadata,
                                )
                                db.session.add(document)
                                db.session.flush()
@ -1005,6 +1011,7 @@ class DocumentService:
                            account,
                            document_name,
                            batch,
                            knowledge_config.metadata,
                        )
                        db.session.add(document)
                        db.session.flush()
@ -1042,6 +1049,7 @@ class DocumentService:
        account: Account,
        name: str,
        batch: str,
        metadata: Optional[MetaDataConfig] = None,
    ):
        document = Document(
            tenant_id=dataset.tenant_id,
@ -1057,6 +1065,9 @@ class DocumentService:
            doc_form=document_form,
            doc_language=document_language,
        )
        if metadata is not None:
            document.doc_metadata = metadata.doc_metadata
            document.doc_type = metadata.doc_type
        return document
    @staticmethod
@ -1169,6 +1180,10 @@ class DocumentService:
        # update document name
        if document_data.name:
            document.name = document_data.name
        # update doc_type and doc_metadata if provided
        if document_data.metadata is not None:
            # doc_metadata must receive the metadata dict, not the doc_type string.
            document.doc_metadata = document_data.metadata.doc_metadata
            document.doc_type = document_data.metadata.doc_type
        # update document to be waiting
        document.indexing_status = "waiting"
        document.completed_at = None
--- a/api/services/entities/knowledge_entities/knowledge_entities.py
+++ b/api/services/entities/knowledge_entities/knowledge_entities.py
@ -93,6 +93,11 @@ class RetrievalModel(BaseModel):
    score_threshold: Optional[float] = None
 class MetaDataConfig(BaseModel):
    """Document type plus its free-form metadata, supplied together.

    Carried on ``KnowledgeConfig.metadata`` when a caller attaches metadata
    to a document.
    """

    # Document type identifier (string).
    doc_type: str
    # Metadata key/value pairs for the document.
    doc_metadata: dict
 class KnowledgeConfig(BaseModel):
    original_document_id: Optional[str] = None
    duplicate: bool = True
@ -105,6 +110,7 @@ class KnowledgeConfig(BaseModel):
    embedding_model: Optional[str] = None
    embedding_model_provider: Optional[str] = None
    name: Optional[str] = None
    metadata: Optional[MetaDataConfig] = None
 class SegmentUpdateArgs(BaseModel):
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@ -47,6 +47,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
      <Property name='text' type='string' key='text'>
        Document content
      </Property>
      <Property name='doc_type' type='string' key='doc_type'>
        Type of document (optional):
          - <code>book</code> Book
          - <code>web_page</code> Web page
          - <code>paper</code> Academic paper/article 
          - <code>social_media_post</code> Social media post
          - <code>wikipedia_entry</code> Wikipedia entry
          - <code>personal_document</code> Personal document
          - <code>business_document</code> Business document
          - <code>im_chat_log</code> Chat log
          - <code>synced_from_notion</code> Notion document
          - <code>synced_from_github</code> GitHub document
          - <code>others</code> Other document types
      </Property>
      <Property name='doc_metadata' type='object' key='doc_metadata'>
        Document metadata (required if doc_type is provided). Fields vary by doc_type:
          For <code>book</code>:
          - <code>title</code> Book title 
          - <code>language</code> Book language
          - <code>author</code> Book author
          - <code>publisher</code> Publisher name
          - <code>publication_date</code> Publication date
          - <code>isbn</code> ISBN number
          - <code>category</code> Book category
          For <code>web_page</code>:
          - <code>title</code> Page title
          - <code>url</code> Page URL
          - <code>language</code> Page language
          - <code>publish_date</code> Publish date
          - <code>author/publisher</code> Author or publisher
          - <code>topic/keywords</code> Topic or keywords
          - <code>description</code> Page description
          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
          For doc_type "others", any valid JSON object is accepted
      </Property>
      <Property name='indexing_technique' type='string' key='indexing_technique'>
        Index mode
          - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
@ -195,6 +233,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
          - <code>hierarchical_model</code> Parent-child mode
          - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
        - <code>doc_type</code> Type of document (optional)
          - <code>book</code> Book
            Document records a book or publication
          - <code>web_page</code> Web page 
            Document records web page content
          - <code>paper</code> Academic paper/article
            Document records academic paper or research article
          - <code>social_media_post</code> Social media post
            Content from social media posts
          - <code>wikipedia_entry</code> Wikipedia entry
            Content from Wikipedia entries
          - <code>personal_document</code> Personal document
            Documents related to personal content
          - <code>business_document</code> Business document
            Documents related to business content
          - <code>im_chat_log</code> Chat log
            Records of instant messaging chats
          - <code>synced_from_notion</code> Notion document
            Documents synchronized from Notion
          - <code>synced_from_github</code> GitHub document
            Documents synchronized from GitHub
          - <code>others</code> Other document types
            Other document types not listed above
        - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
          Fields vary by doc_type:
          For <code>book</code>:
          - <code>title</code> Book title
            Title of the book
          - <code>language</code> Book language
            Language of the book
          - <code>author</code> Book author
            Author of the book
          - <code>publisher</code> Publisher name
            Name of the publishing house
          - <code>publication_date</code> Publication date
            Date when the book was published
          - <code>isbn</code> ISBN number
            International Standard Book Number
          - <code>category</code> Book category
            Category or genre of the book
          For <code>web_page</code>:
          - <code>title</code> Page title
            Title of the web page
          - <code>url</code> Page URL
            URL address of the web page
          - <code>language</code> Page language
            Language of the web page
          - <code>publish_date</code> Publish date
            Date when the web page was published
          - <code>author/publisher</code> Author or publisher
            Author or publisher of the web page
          - <code>topic/keywords</code> Topic or keywords
            Topics or keywords of the web page
          - <code>description</code> Page description
            Description of the web page content
          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
          For doc_type "others", any valid JSON object is accepted
        - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
        - <code>process_rule</code> Processing rules
@ -307,6 +407,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
      <Property name='description' type='string' key='description'>
        Knowledge description (optional)
      </Property>
      <Property name='doc_type' type='string' key='doc_type'>
        Type of document (optional):
          - <code>book</code> Book
          - <code>web_page</code> Web page
          - <code>paper</code> Academic paper/article 
          - <code>social_media_post</code> Social media post
          - <code>wikipedia_entry</code> Wikipedia entry
          - <code>personal_document</code> Personal document
          - <code>business_document</code> Business document
          - <code>im_chat_log</code> Chat log
          - <code>synced_from_notion</code> Notion document
          - <code>synced_from_github</code> GitHub document
          - <code>others</code> Other document types
      </Property>
      <Property name='doc_metadata' type='object' key='doc_metadata'>
        Document metadata (required if doc_type is provided). Fields vary by doc_type:
          For <code>book</code>:
          - <code>title</code> Book title 
          - <code>language</code> Book language
          - <code>author</code> Book author
          - <code>publisher</code> Publisher name
          - <code>publication_date</code> Publication date
          - <code>isbn</code> ISBN number
          - <code>category</code> Book category
          For <code>web_page</code>:
          - <code>title</code> Page title
          - <code>url</code> Page URL
          - <code>language</code> Page language
          - <code>publish_date</code> Publish date
          - <code>author/publisher</code> Author or publisher
          - <code>topic/keywords</code> Topic or keywords
          - <code>description</code> Page description
          Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
          For doc_type "others", any valid JSON object is accepted
      </Property>
      <Property name='indexing_technique' type='string' key='indexing_technique'>
        Index technique (optional)
          - <code>high_quality</code> High quality
@ -624,6 +762,67 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
              - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
              - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
              - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
            - <code>doc_type</code> Type of document (optional)
              - <code>book</code> Book
                Document records a book or publication
              - <code>web_page</code> Web page 
                Document records web page content
              - <code>paper</code> Academic paper/article
                Document records academic paper or research article
              - <code>social_media_post</code> Social media post
                Content from social media posts
              - <code>wikipedia_entry</code> Wikipedia entry
                Content from Wikipedia entries
              - <code>personal_document</code> Personal document
                Documents related to personal content
              - <code>business_document</code> Business document
                Documents related to business content
              - <code>im_chat_log</code> Chat log
                Records of instant messaging chats
              - <code>synced_from_notion</code> Notion document
                Documents synchronized from Notion
              - <code>synced_from_github</code> GitHub document
                Documents synchronized from GitHub
              - <code>others</code> Other document types
                Other document types not listed above
            - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
              Fields vary by doc_type:
              For <code>book</code>:
              - <code>title</code> Book title
                Title of the book
              - <code>language</code> Book language
                Language of the book
              - <code>author</code> Book author
                Author of the book
              - <code>publisher</code> Publisher name
                Name of the publishing house
              - <code>publication_date</code> Publication date
                Date when the book was published
              - <code>isbn</code> ISBN number
                International Standard Book Number
              - <code>category</code> Book category
                Category or genre of the book
              For <code>web_page</code>:
              - <code>title</code> Page title
                Title of the web page
              - <code>url</code> Page URL
                URL address of the web page
              - <code>language</code> Page language
                Language of the web page
              - <code>publish_date</code> Publish date
                Date when the web page was published
              - <code>author/publisher</code> Author or publisher
                Author or publisher of the web page
              - <code>topic/keywords</code> Topic or keywords
                Topics or keywords of the web page
              - <code>description</code> Page description
                Description of the web page content
              Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
              For doc_type "others", any valid JSON object is accepted
      </Property>
    </Properties>
  </Col>
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@ -47,6 +47,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
      <Property name='text' type='string' key='text'>
        文档内容
      </Property>
      <Property name='doc_type' type='string' key='doc_type'>
        文档类型（选填）
          - <code>book</code> 图书 Book
          - <code>web_page</code> 网页 Web page
          - <code>paper</code> 学术论文/文章 Academic paper/article 
          - <code>social_media_post</code> 社交媒体帖子 Social media post
          - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
          - <code>personal_document</code> 个人文档 Personal document
          - <code>business_document</code> 商业文档 Business document
          - <code>im_chat_log</code> 即时通讯记录 Chat log
          - <code>synced_from_notion</code> Notion同步文档 Notion document
          - <code>synced_from_github</code> GitHub同步文档 GitHub document
          - <code>others</code> 其他文档类型 Other document types
      </Property>
      <Property name='doc_metadata' type='object' key='doc_metadata'>
        文档元数据（如提供文档类型则必填）。字段因文档类型而异：
          针对图书 For <code>book</code>:
          - <code>title</code> 书名 Book title 
          - <code>language</code> 图书语言 Book language
          - <code>author</code> 作者 Book author
          - <code>publisher</code> 出版社 Publisher name
          - <code>publication_date</code> 出版日期 Publication date
          - <code>isbn</code> ISBN号码 ISBN number
          - <code>category</code> 图书分类 Book category
          针对网页 For <code>web_page</code>:
          - <code>title</code> 页面标题 Page title
          - <code>url</code> 页面网址 Page URL
          - <code>language</code> 页面语言 Page language
          - <code>publish_date</code> 发布日期 Publish date
          - <code>author/publisher</code> 作者/发布者 Author or publisher
          - <code>topic/keywords</code> 主题/关键词 Topic or keywords
          - <code>description</code> 页面描述 Page description
          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
          针对"其他"类型文档，接受任何有效的JSON对象
      </Property>
      <Property name='indexing_technique' type='string' key='indexing_technique'>
        索引方式
          - <code>high_quality</code> 高质量：使用  embedding 模型进行嵌入，构建为向量数据库索引
@ -194,6 +234,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
          - <code>text_model</code> text 文档直接 embedding，经济模式默认为该模式
          - <code>hierarchical_model</code> parent-child 模式
          - <code>qa_model</code> Q&A 模式：为分片文档生成 Q&A 对，然后对问题进行 embedding
        - <code>doc_type</code> 文档类型（选填）Type of document (optional)
          - <code>book</code> 图书
            文档记录一本书籍或出版物
          - <code>web_page</code> 网页
            网页内容的文档记录
          - <code>paper</code> 学术论文/文章
            学术论文或研究文章的记录
          - <code>social_media_post</code> 社交媒体帖子
            社交媒体上的帖子内容
          - <code>wikipedia_entry</code> 维基百科条目
            维基百科的词条内容
          - <code>personal_document</code> 个人文档
            个人相关的文档记录
          - <code>business_document</code> 商业文档
            商业相关的文档记录
          - <code>im_chat_log</code> 即时通讯记录
            即时通讯的聊天记录
          - <code>synced_from_notion</code> Notion同步文档
            从Notion同步的文档内容
          - <code>synced_from_github</code> GitHub同步文档
            从GitHub同步的文档内容
          - <code>others</code> 其他文档类型
            其他未列出的文档类型
        - <code>doc_metadata</code> 文档元数据（如提供文档类型则必填）
          字段因文档类型而异
          针对图书类型 For <code>book</code>:
          - <code>title</code> 书名
            书籍的标题
          - <code>language</code> 图书语言
            书籍的语言
          - <code>author</code> 作者
            书籍的作者
          - <code>publisher</code> 出版社
            出版社的名称
          - <code>publication_date</code> 出版日期
            书籍的出版日期
          - <code>isbn</code> ISBN号码
            书籍的ISBN编号
          - <code>category</code> 图书分类
            书籍的分类类别
          针对网页类型 For <code>web_page</code>:
          - <code>title</code> 页面标题
            网页的标题
          - <code>url</code> 页面网址
            网页的URL地址
          - <code>language</code> 页面语言
            网页的语言
          - <code>publish_date</code> 发布日期
            网页的发布日期
          - <code>author/publisher</code> 作者/发布者
            网页的作者或发布者
          - <code>topic/keywords</code> 主题/关键词
            网页的主题或关键词
          - <code>description</code> 页面描述
            网页的描述信息
          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
          针对"其他"类型文档，接受任何有效的JSON对象
        - <code>doc_language</code> 在 Q&A 模式下，指定文档的语言，例如：<code>English</code>、<code>Chinese</code>
@ -504,6 +606,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
      <Property name='text' type='string' key='text'>
        文档内容（选填）
      </Property>
      <Property name='doc_type' type='string' key='doc_type'>
        文档类型（选填）
          - <code>book</code> 图书 Book
          - <code>web_page</code> 网页 Web page
          - <code>paper</code> 学术论文/文章 Academic paper/article 
          - <code>social_media_post</code> 社交媒体帖子 Social media post
          - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
          - <code>personal_document</code> 个人文档 Personal document
          - <code>business_document</code> 商业文档 Business document
          - <code>im_chat_log</code> 即时通讯记录 Chat log
          - <code>synced_from_notion</code> Notion同步文档 Notion document
          - <code>synced_from_github</code> GitHub同步文档 GitHub document
          - <code>others</code> 其他文档类型 Other document types
      </Property>
      <Property name='doc_metadata' type='object' key='doc_metadata'>
        文档元数据（如提供文档类型则必填）。字段因文档类型而异：
          针对图书 For <code>book</code>:
          - <code>title</code> 书名 Book title 
          - <code>language</code> 图书语言 Book language
          - <code>author</code> 作者 Book author
          - <code>publisher</code> 出版社 Publisher name
          - <code>publication_date</code> 出版日期 Publication date
          - <code>isbn</code> ISBN号码 ISBN number
          - <code>category</code> 图书分类 Book category
          针对网页 For <code>web_page</code>:
          - <code>title</code> 页面标题 Page title
          - <code>url</code> 页面网址 Page URL
          - <code>language</code> 页面语言 Page language
          - <code>publish_date</code> 发布日期 Publish date
          - <code>author/publisher</code> 作者/发布者 Author or publisher
          - <code>topic/keywords</code> 主题/关键词 Topic or keywords
          - <code>description</code> 页面描述 Page description
          请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
          针对"其他"类型文档，接受任何有效的JSON对象
      </Property>
      <Property name='process_rule' type='object' key='process_rule'>
        处理规则（选填）
          - <code>mode</code> (string) 清洗、分段模式 ，automatic 自动 / custom 自定义
@ -624,6 +766,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
              - <code>separator</code> 分段标识符，目前仅允许设置一个分隔符。默认为 <code>***</code>
              - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
              - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时，段与段之间存在一定的重叠部分（选填）
            - <code>doc_type</code> 文档类型（选填）Type of document (optional)
              - <code>book</code> 图书
                文档记录一本书籍或出版物
              - <code>web_page</code> 网页
                网页内容的文档记录
              - <code>paper</code> 学术论文/文章
                学术论文或研究文章的记录
              - <code>social_media_post</code> 社交媒体帖子
                社交媒体上的帖子内容
              - <code>wikipedia_entry</code> 维基百科条目
                维基百科的词条内容
              - <code>personal_document</code> 个人文档
                个人相关的文档记录
              - <code>business_document</code> 商业文档
                商业相关的文档记录
              - <code>im_chat_log</code> 即时通讯记录
                即时通讯的聊天记录
              - <code>synced_from_notion</code> Notion同步文档
                从Notion同步的文档内容
              - <code>synced_from_github</code> GitHub同步文档
                从GitHub同步的文档内容
              - <code>others</code> 其他文档类型
                其他未列出的文档类型
            - <code>doc_metadata</code> 文档元数据（如提供文档类型则必填）
              字段因文档类型而异
              针对图书类型 For <code>book</code>:
              - <code>title</code> 书名
                书籍的标题
              - <code>language</code> 图书语言
                书籍的语言
              - <code>author</code> 作者
                书籍的作者
              - <code>publisher</code> 出版社
                出版社的名称
              - <code>publication_date</code> 出版日期
                书籍的出版日期
              - <code>isbn</code> ISBN号码
                书籍的ISBN编号
              - <code>category</code> 图书分类
                书籍的分类类别
              针对网页类型 For <code>web_page</code>:
              - <code>title</code> 页面标题
                网页的标题
              - <code>url</code> 页面网址
                网页的URL地址
              - <code>language</code> 页面语言
                网页的语言
              - <code>publish_date</code> 发布日期
                网页的发布日期
              - <code>author/publisher</code> 作者/发布者
                网页的作者或发布者
              - <code>topic/keywords</code> 主题/关键词
                网页的主题或关键词
              - <code>description</code> 页面描述
                网页的描述信息
              请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
              针对"其他"类型文档，接受任何有效的JSON对象
      </Property>
    </Properties>
  </Col>