Fix/create document by api with metadata (#16307)

Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Authored by Jyong on 2025-03-20 14:33:32 +08:00; committed by GitHub
parent c1f3d968bf
commit 2c9af712a2
7 changed files with 75 additions and 527 deletions

View File

@ -20,7 +20,7 @@ from libs.helper import email as email_validate
from libs.password import hash_password, password_pattern, valid_password
from libs.rsa import generate_key_pair
from models import Tenant
from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
from models.dataset import Document as DatasetDocument
from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
from models.provider import Provider, ProviderModel
@ -483,14 +483,11 @@ def convert_to_agent_apps():
click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
@click.command("add-qdrant-index", help="Add Qdrant index.")
@click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
def add_qdrant_doc_id_index(field: str):
click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
vector_type = dify_config.VECTOR_STORE
if vector_type != "qdrant":
click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
return
def add_qdrant_index(field: str):
click.echo(click.style("Starting Qdrant index creation.", fg="green"))
create_count = 0
try:
@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str):
click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
@click.command("old-metadata-migration", help="Old metadata migration.")
def old_metadata_migration():
"""
Old metadata migration.
"""
click.echo(click.style("Starting old metadata migration.", fg="green"))
page = 1
while True:
try:
documents = (
DatasetDocument.query.filter(DatasetDocument.doc_metadata.isnot(None))  # column comparison, not Python "is not"
.order_by(DatasetDocument.created_at.desc())
.paginate(page=page, per_page=50)
)
except NotFound:
break
if not documents:
break
for document in documents:
if document.doc_metadata:
doc_metadata = document.doc_metadata
for key, value in doc_metadata.items():
dataset_metadata = (
db.session.query(DatasetMetadata)
.filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
.first()
)
if not dataset_metadata:
dataset_metadata = DatasetMetadata(
tenant_id=document.tenant_id,
dataset_id=document.dataset_id,
name=key,
type="string",
created_by=document.created_by,
)
db.session.add(dataset_metadata)
db.session.flush()
dataset_metadata_binding = DatasetMetadataBinding(
tenant_id=document.tenant_id,
dataset_id=document.dataset_id,
metadata_id=dataset_metadata.id,
document_id=document.id,
created_by=document.created_by,
)
db.session.add(dataset_metadata_binding)
else:
dataset_metadata_binding = DatasetMetadataBinding.query.filter(
DatasetMetadataBinding.dataset_id == document.dataset_id,
DatasetMetadataBinding.document_id == document.id,
DatasetMetadataBinding.metadata_id == dataset_metadata.id,
).first()
if not dataset_metadata_binding:
dataset_metadata_binding = DatasetMetadataBinding(
tenant_id=document.tenant_id,
dataset_id=document.dataset_id,
metadata_id=dataset_metadata.id,
document_id=document.id,
created_by=document.created_by,
)
db.session.add(dataset_metadata_binding)
db.session.commit()
page += 1
click.echo(click.style("Old metadata migration completed.", fg="green"))
@click.command("create-tenant", help="Create account and tenant.")
@click.option("--email", prompt=True, help="Tenant account email.")
@click.option("--name", prompt=True, help="Workspace name.")

View File

@ -18,7 +18,6 @@ from controllers.service_api.app.error import (
from controllers.service_api.dataset.error import (
ArchivedDocumentImmutableError,
DocumentIndexingError,
InvalidMetadataError,
)
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
from core.errors.error import ProviderTokenNotInitError
@ -51,8 +50,6 @@ class DocumentAddByTextApi(DatasetApiResource):
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()
dataset_id = str(dataset_id)
@ -65,28 +62,6 @@ class DocumentAddByTextApi(DatasetApiResource):
if not dataset.indexing_technique and not args["indexing_technique"]:
raise ValueError("indexing_technique is required.")
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
text = args.get("text")
name = args.get("name")
if text is None or name is None:
@ -133,8 +108,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@ -146,29 +119,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
# indexing_technique is already set in dataset since this is an update
args["indexing_technique"] = dataset.indexing_technique
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
if args["text"]:
text = args.get("text")
name = args.get("name")
@ -216,29 +166,6 @@ class DocumentAddByFileApi(DatasetApiResource):
if "doc_language" not in args:
args["doc_language"] = "English"
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
# get dataset info
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@ -306,29 +233,6 @@ class DocumentUpdateByFileApi(DatasetApiResource):
if "doc_language" not in args:
args["doc_language"] = "English"
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
# get dataset info
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
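With the validation blocks above removed, the add/update endpoints no longer accept `doc_type` or `doc_metadata` in the request body; document-level metadata is handled through the dataset metadata models instead (see the migration command earlier in this diff). A hedged sketch of an add-by-text call after this change; the base URL, endpoint path, and API key are placeholders rather than values taken from this diff:

```python
import requests

# Hypothetical endpoint path and credentials; the body uses only fields that remain
# documented for this endpoint (doc_type/doc_metadata are gone).
resp = requests.post(
    "https://api.dify.ai/v1/datasets/<dataset_id>/document/create-by-text",
    headers={"Authorization": "Bearer <dataset-api-key>"},
    json={
        "name": "readme.txt",
        "text": "Document content ...",
        "indexing_technique": "high_quality",
        "process_rule": {"mode": "automatic"},
    },
    timeout=30,
)
print(resp.status_code, resp.json())
```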

View File

@ -3,7 +3,7 @@ from dify_app import DifyApp
def init_app(app: DifyApp):
from commands import (
add_qdrant_doc_id_index,
add_qdrant_index,
convert_to_agent_apps,
create_tenant,
extract_plugins,
@ -11,6 +11,7 @@ def init_app(app: DifyApp):
fix_app_site_missing,
install_plugins,
migrate_data_for_plugin,
old_metadata_migration,
reset_email,
reset_encrypt_key_pair,
reset_password,
@ -24,7 +25,7 @@ def init_app(app: DifyApp):
reset_encrypt_key_pair,
vdb_migrate,
convert_to_agent_apps,
add_qdrant_doc_id_index,
add_qdrant_index,
create_tenant,
upgrade_db,
fix_app_site_missing,
@ -32,6 +33,7 @@ def init_app(app: DifyApp):
extract_plugins,
extract_unique_plugins,
install_plugins,
old_metadata_migration,
]
for cmd in cmds_to_register:
app.cli.add_command(cmd)

View File

@ -46,7 +46,6 @@ from models.source import DataSourceOauthBinding
from services.entities.knowledge_entities.knowledge_entities import (
ChildChunkUpdateArgs,
KnowledgeConfig,
MetaDataConfig,
RerankingModel,
RetrievalModel,
SegmentUpdateArgs,
@ -999,9 +998,6 @@ class DocumentService:
document.data_source_info = json.dumps(data_source_info)
document.batch = batch
document.indexing_status = "waiting"
if knowledge_config.metadata:
document.doc_type = knowledge_config.metadata.doc_type
document.metadata = knowledge_config.metadata.doc_metadata
db.session.add(document)
documents.append(document)
duplicate_document_ids.append(document.id)
@ -1018,7 +1014,6 @@ class DocumentService:
account,
file_name,
batch,
knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@ -1076,7 +1071,6 @@ class DocumentService:
account,
truncated_page_name,
batch,
knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@ -1117,7 +1111,6 @@ class DocumentService:
account,
document_name,
batch,
knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@ -1155,7 +1148,6 @@ class DocumentService:
account: Account,
name: str,
batch: str,
metadata: Optional[MetaDataConfig] = None,
):
document = Document(
tenant_id=dataset.tenant_id,
@ -1180,9 +1172,6 @@ class DocumentService:
BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
BuiltInField.source: data_source_type,
}
if metadata is not None:
doc_metadata.update(metadata.doc_metadata)
document.doc_type = metadata.doc_type
if doc_metadata:
document.doc_metadata = doc_metadata
return document
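With the `MetaDataConfig` branch removed, `build_document` now fills `doc_metadata` exclusively from the built-in fields; caller-supplied `doc_type`/`doc_metadata` values no longer reach the model here. A rough sketch of the resulting shape, showing only the two built-in keys visible in this hunk (the preceding entries are elided from the diff, and the example values are invented):

```python
# Illustrative only: keys mirror the BuiltInField entries visible above.
doc_metadata = {
    # ... earlier BuiltInField entries elided from this hunk ...
    "last_update_date": "2025-03-20 06:33:32",  # datetime.datetime.now(datetime.UTC), formatted as above
    "source": "upload_file",                    # the document's data_source_type
}
```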
@ -1297,10 +1286,6 @@ class DocumentService:
# update document name
if document_data.name:
document.name = document_data.name
# update doc_type and doc_metadata if provided
if document_data.metadata is not None:
document.doc_metadata = document_data.metadata.doc_metadata
document.doc_type = document_data.metadata.doc_type
# update document to be waiting
document.indexing_status = "waiting"
document.completed_at = None

View File

@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel):
embedding_model: Optional[str] = None
embedding_model_provider: Optional[str] = None
name: Optional[str] = None
metadata: Optional[MetaDataConfig] = None
class SegmentUpdateArgs(BaseModel):

View File

@ -47,44 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<Property name='text' type='string' key='text'>
Document content
</Property>
<Property name='doc_type' type='string' key='doc_type'>
Type of document (optional):
- <code>book</code> Book
- <code>web_page</code> Web page
- <code>paper</code> Academic paper/article
- <code>social_media_post</code> Social media post
- <code>wikipedia_entry</code> Wikipedia entry
- <code>personal_document</code> Personal document
- <code>business_document</code> Business document
- <code>im_chat_log</code> Chat log
- <code>synced_from_notion</code> Notion document
- <code>synced_from_github</code> GitHub document
- <code>others</code> Other document types
</Property>
<Property name='doc_metadata' type='object' key='doc_metadata'>
Document metadata (required if doc_type is provided). Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
- <code>language</code> Book language
- <code>author</code> Book author
- <code>publisher</code> Publisher name
- <code>publication_date</code> Publication date
- <code>isbn</code> ISBN number
- <code>category</code> Book category
For <code>web_page</code>:
- <code>title</code> Page title
- <code>url</code> Page URL
- <code>language</code> Page language
- <code>publish_date</code> Publish date
- <code>author/publisher</code> Author or publisher
- <code>topic/keywords</code> Topic or keywords
- <code>description</code> Page description
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property>
<Property name='indexing_technique' type='string' key='indexing_technique'>
Index mode
- <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
@ -233,68 +195,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>hierarchical_model</code> Parent-child mode
- <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
- <code>doc_type</code> Type of document (optional)
- <code>book</code> Book
Document records a book or publication
- <code>web_page</code> Web page
Document records web page content
- <code>paper</code> Academic paper/article
Document records academic paper or research article
- <code>social_media_post</code> Social media post
Content from social media posts
- <code>wikipedia_entry</code> Wikipedia entry
Content from Wikipedia entries
- <code>personal_document</code> Personal document
Documents related to personal content
- <code>business_document</code> Business document
Documents related to business content
- <code>im_chat_log</code> Chat log
Records of instant messaging chats
- <code>synced_from_notion</code> Notion document
Documents synchronized from Notion
- <code>synced_from_github</code> GitHub document
Documents synchronized from GitHub
- <code>others</code> Other document types
Other document types not listed above
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
Title of the book
- <code>language</code> Book language
Language of the book
- <code>author</code> Book author
Author of the book
- <code>publisher</code> Publisher name
Name of the publishing house
- <code>publication_date</code> Publication date
Date when the book was published
- <code>isbn</code> ISBN number
International Standard Book Number
- <code>category</code> Book category
Category or genre of the book
For <code>web_page</code>:
- <code>title</code> Page title
Title of the web page
- <code>url</code> Page URL
URL address of the web page
- <code>language</code> Page language
Language of the web page
- <code>publish_date</code> Publish date
Date when the web page was published
- <code>author/publisher</code> Author or publisher
Author or publisher of the web page
- <code>topic/keywords</code> Topic or keywords
Topics or keywords of the web page
- <code>description</code> Page description
Description of the web page content
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
- <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
- <code>process_rule</code> Processing rules
@ -407,44 +307,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<Property name='description' type='string' key='description'>
Knowledge description (optional)
</Property>
<Property name='doc_type' type='string' key='doc_type'>
Type of document (optional):
- <code>book</code> Book
- <code>web_page</code> Web page
- <code>paper</code> Academic paper/article
- <code>social_media_post</code> Social media post
- <code>wikipedia_entry</code> Wikipedia entry
- <code>personal_document</code> Personal document
- <code>business_document</code> Business document
- <code>im_chat_log</code> Chat log
- <code>synced_from_notion</code> Notion document
- <code>synced_from_github</code> GitHub document
- <code>others</code> Other document types
</Property>
<Property name='doc_metadata' type='object' key='doc_metadata'>
Document metadata (required if doc_type is provided). Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
- <code>language</code> Book language
- <code>author</code> Book author
- <code>publisher</code> Publisher name
- <code>publication_date</code> Publication date
- <code>isbn</code> ISBN number
- <code>category</code> Book category
For <code>web_page</code>:
- <code>title</code> Page title
- <code>url</code> Page URL
- <code>language</code> Page language
- <code>publish_date</code> Publish date
- <code>author/publisher</code> Author or publisher
- <code>topic/keywords</code> Topic or keywords
- <code>description</code> Page description
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property>
<Property name='indexing_technique' type='string' key='indexing_technique'>
Index technique (optional)
- <code>high_quality</code> High quality
@ -762,67 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
- <code>max_tokens</code> The maximum length (in tokens); must be shorter than the length of the parent chunk
- <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
- <code>doc_type</code> Type of document (optional)
- <code>book</code> Book
Document records a book or publication
- <code>web_page</code> Web page
Document records web page content
- <code>paper</code> Academic paper/article
Document records academic paper or research article
- <code>social_media_post</code> Social media post
Content from social media posts
- <code>wikipedia_entry</code> Wikipedia entry
Content from Wikipedia entries
- <code>personal_document</code> Personal document
Documents related to personal content
- <code>business_document</code> Business document
Documents related to business content
- <code>im_chat_log</code> Chat log
Records of instant messaging chats
- <code>synced_from_notion</code> Notion document
Documents synchronized from Notion
- <code>synced_from_github</code> GitHub document
Documents synchronized from GitHub
- <code>others</code> Other document types
Other document types not listed above
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
Title of the book
- <code>language</code> Book language
Language of the book
- <code>author</code> Book author
Author of the book
- <code>publisher</code> Publisher name
Name of the publishing house
- <code>publication_date</code> Publication date
Date when the book was published
- <code>isbn</code> ISBN number
International Standard Book Number
- <code>category</code> Book category
Category or genre of the book
For <code>web_page</code>:
- <code>title</code> Page title
Title of the web page
- <code>url</code> Page URL
URL address of the web page
- <code>language</code> Page language
Language of the web page
- <code>publish_date</code> Publish date
Date when the web page was published
- <code>author/publisher</code> Author or publisher
Author or publisher of the web page
- <code>topic/keywords</code> Topic or keywords
Topics or keywords of the web page
- <code>description</code> Page description
Description of the web page content
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property>
</Properties>
</Col>
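For reference, the `process_rule` object that remains documented above can be assembled along these lines for parent-child indexing; everything beyond `separator`, `max_tokens`, and `chunk_overlap` is an assumption based on the rest of this documentation page rather than on this hunk:

```python
# Hedged sketch of a parent-child process_rule payload; the "hierarchical" mode value,
# parent_mode, and subchunk_segmentation keys are assumed names, not shown in this hunk.
process_rule = {
    "mode": "hierarchical",
    "rules": {
        "segmentation": {"separator": "***", "max_tokens": 500},
        "parent_mode": "paragraph",
        "subchunk_segmentation": {"separator": "***", "max_tokens": 200, "chunk_overlap": 50},
    },
}
```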
@ -1528,7 +1329,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
"id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
"data_source_type": "upload_file",
"name": "readme.txt",
"doc_type": null
}
},
"score": 3.730463140527718e-05,

View File

@ -47,46 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<Property name='text' type='string' key='text'>
Document content
</Property>
<Property name='doc_type' type='string' key='doc_type'>
Type of document (optional):
- <code>book</code> Book
- <code>web_page</code> Web page
- <code>paper</code> Academic paper/article
- <code>social_media_post</code> Social media post
- <code>wikipedia_entry</code> Wikipedia entry
- <code>personal_document</code> Personal document
- <code>business_document</code> Business document
- <code>im_chat_log</code> Chat log
- <code>synced_from_notion</code> Notion document
- <code>synced_from_github</code> GitHub document
- <code>others</code> Other document types
</Property>
<Property name='doc_metadata' type='object' key='doc_metadata'>
Document metadata (required if doc_type is provided). Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
- <code>language</code> Book language
- <code>author</code> Book author
- <code>publisher</code> Publisher name
- <code>publication_date</code> Publication date
- <code>isbn</code> ISBN number
- <code>category</code> Book category
For <code>web_page</code>:
- <code>title</code> Page title
- <code>url</code> Page URL
- <code>language</code> Page language
- <code>publish_date</code> Publish date
- <code>author/publisher</code> Author or publisher
- <code>topic/keywords</code> Topic or keywords
- <code>description</code> Page description
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property>
<Property name='indexing_technique' type='string' key='indexing_technique'>
Index mode
- <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
@ -234,68 +194,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>text_model</code> Text documents are embedded directly; economy mode, the default mode
- <code>hierarchical_model</code> Parent-child mode
- <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
- <code>doc_type</code> Type of document (optional)
- <code>book</code> Book
Document records a book or publication
- <code>web_page</code> Web page
Document records web page content
- <code>paper</code> Academic paper/article
Document records academic paper or research article
- <code>social_media_post</code> Social media post
Content from social media posts
- <code>wikipedia_entry</code> Wikipedia entry
Content from Wikipedia entries
- <code>personal_document</code> Personal document
Documents related to personal content
- <code>business_document</code> Business document
Documents related to business content
- <code>im_chat_log</code> Chat log
Records of instant messaging chats
- <code>synced_from_notion</code> Notion document
Documents synchronized from Notion
- <code>synced_from_github</code> GitHub document
Documents synchronized from GitHub
- <code>others</code> Other document types
Other document types not listed above
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
Title of the book
- <code>language</code> Book language
Language of the book
- <code>author</code> Book author
Author of the book
- <code>publisher</code> Publisher name
Name of the publishing house
- <code>publication_date</code> Publication date
Date when the book was published
- <code>isbn</code> ISBN number
International Standard Book Number
- <code>category</code> Book category
Category or genre of the book
For <code>web_page</code>:
- <code>title</code> Page title
Title of the web page
- <code>url</code> Page URL
URL address of the web page
- <code>language</code> Page language
Language of the web page
- <code>publish_date</code> Publish date
Date when the web page was published
- <code>author/publisher</code> Author or publisher
Author or publisher of the web page
- <code>topic/keywords</code> Topic or keywords
Topics or keywords of the web page
- <code>description</code> Page description
Description of the web page content
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
- <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
@ -606,46 +504,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<Property name='text' type='string' key='text'>
Document content (optional)
</Property>
<Property name='doc_type' type='string' key='doc_type'>
Type of document (optional):
- <code>book</code> Book
- <code>web_page</code> Web page
- <code>paper</code> Academic paper/article
- <code>social_media_post</code> Social media post
- <code>wikipedia_entry</code> Wikipedia entry
- <code>personal_document</code> Personal document
- <code>business_document</code> Business document
- <code>im_chat_log</code> Chat log
- <code>synced_from_notion</code> Notion document
- <code>synced_from_github</code> GitHub document
- <code>others</code> Other document types
</Property>
<Property name='doc_metadata' type='object' key='doc_metadata'>
Document metadata (required if doc_type is provided). Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
- <code>language</code> Book language
- <code>author</code> Book author
- <code>publisher</code> Publisher name
- <code>publication_date</code> Publication date
- <code>isbn</code> ISBN number
- <code>category</code> Book category
For <code>web_page</code>:
- <code>title</code> Page title
- <code>url</code> Page URL
- <code>language</code> Page language
- <code>publish_date</code> Publish date
- <code>author/publisher</code> Author or publisher
- <code>topic/keywords</code> Topic or keywords
- <code>description</code> Page description
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property>
<Property name='process_rule' type='object' key='process_rule'>
Processing rules (optional)
- <code>mode</code> (string) Cleaning and segmentation mode: automatic / custom
@ -766,68 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
- <code>max_tokens</code> The maximum length (in tokens); must be shorter than the length of the parent chunk
- <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
- <code>doc_type</code> Type of document (optional)
- <code>book</code> Book
Document records a book or publication
- <code>web_page</code> Web page
Document records web page content
- <code>paper</code> Academic paper/article
Document records academic paper or research article
- <code>social_media_post</code> Social media post
Content from social media posts
- <code>wikipedia_entry</code> Wikipedia entry
Content from Wikipedia entries
- <code>personal_document</code> Personal document
Documents related to personal content
- <code>business_document</code> Business document
Documents related to business content
- <code>im_chat_log</code> Chat log
Records of instant messaging chats
- <code>synced_from_notion</code> Notion document
Documents synchronized from Notion
- <code>synced_from_github</code> GitHub document
Documents synchronized from GitHub
- <code>others</code> Other document types
Other document types not listed above
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
Fields vary by doc_type:
For <code>book</code>:
- <code>title</code> Book title
Title of the book
- <code>language</code> Book language
Language of the book
- <code>author</code> Book author
Author of the book
- <code>publisher</code> Publisher name
Name of the publishing house
- <code>publication_date</code> Publication date
Date when the book was published
- <code>isbn</code> ISBN number
International Standard Book Number
- <code>category</code> Book category
Category or genre of the book
For <code>web_page</code>:
- <code>title</code> Page title
Title of the web page
- <code>url</code> Page URL
URL address of the web page
- <code>language</code> Page language
Language of the web page
- <code>publish_date</code> Publish date
Date when the web page was published
- <code>author/publisher</code> Author or publisher
Author or publisher of the web page
- <code>topic/keywords</code> Topic or keywords
Topics or keywords of the web page
- <code>description</code> Page description
Description of the web page content
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
For doc_type "others", any valid JSON object is accepted
</Property>
</Properties>
</Col>
@ -1534,7 +1330,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
"id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
"data_source_type": "upload_file",
"name": "readme.txt",
"doc_type": null
}
},
"score": 3.730463140527718e-05,