mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 18:39:06 +08:00
feat (document_extractor): support .properties file (#18969)
This commit is contained in:
parent
f86e2edc54
commit
5de01c1444
@ -16,11 +16,25 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
|
|||||||
|
|
||||||
|
|
||||||
if dify_config.ETL_TYPE == "Unstructured":
|
if dify_config.ETL_TYPE == "Unstructured":
|
||||||
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt"]
|
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt", "properties"]
|
||||||
DOCUMENT_EXTENSIONS.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
|
DOCUMENT_EXTENSIONS.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
|
||||||
if dify_config.UNSTRUCTURED_API_URL:
|
if dify_config.UNSTRUCTURED_API_URL:
|
||||||
DOCUMENT_EXTENSIONS.append("ppt")
|
DOCUMENT_EXTENSIONS.append("ppt")
|
||||||
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
||||||
else:
|
else:
|
||||||
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv", "vtt"]
|
DOCUMENT_EXTENSIONS = [
|
||||||
|
"txt",
|
||||||
|
"markdown",
|
||||||
|
"md",
|
||||||
|
"mdx",
|
||||||
|
"pdf",
|
||||||
|
"html",
|
||||||
|
"htm",
|
||||||
|
"xlsx",
|
||||||
|
"xls",
|
||||||
|
"docx",
|
||||||
|
"csv",
|
||||||
|
"vtt",
|
||||||
|
"properties",
|
||||||
|
]
|
||||||
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
||||||
|
@ -135,6 +135,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
|
|||||||
return _extract_text_from_yaml(file_content)
|
return _extract_text_from_yaml(file_content)
|
||||||
case "text/vtt":
|
case "text/vtt":
|
||||||
return _extract_text_from_vtt(file_content)
|
return _extract_text_from_vtt(file_content)
|
||||||
|
case "text/properties":
|
||||||
|
return _extract_text_from_properties(file_content)
|
||||||
case _:
|
case _:
|
||||||
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
||||||
|
|
||||||
@ -170,6 +172,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
|
|||||||
return _extract_text_from_msg(file_content)
|
return _extract_text_from_msg(file_content)
|
||||||
case ".vtt":
|
case ".vtt":
|
||||||
return _extract_text_from_vtt(file_content)
|
return _extract_text_from_vtt(file_content)
|
||||||
|
case ".properties":
|
||||||
|
return _extract_text_from_properties(file_content)
|
||||||
case _:
|
case _:
|
||||||
raise UnsupportedFileTypeError(f"Unsupported Extension Type: {file_extension}")
|
raise UnsupportedFileTypeError(f"Unsupported Extension Type: {file_extension}")
|
||||||
|
|
||||||
@ -506,3 +510,29 @@ def _extract_text_from_vtt(vtt_bytes: bytes) -> str:
|
|||||||
# Return the result in the specified format: Speaker "text" style
|
# Return the result in the specified format: Speaker "text" style
|
||||||
formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results]
|
formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results]
|
||||||
return "\n".join(formatted)
|
return "\n".join(formatted)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text_from_properties(file_content: bytes) -> str:
    """Render the contents of a ``.properties`` file as ``key: value`` lines.

    The bytes are first turned into text by ``_extract_text_from_plain_text``.
    Each line is then stripped of surrounding whitespace and handled as follows:

    * Blank lines and comment lines (starting with ``#`` or ``!``) are kept
      as they are.
    * Any other line is split at the first ``=`` or ``:``, whichever comes
      first, and emitted as ``"<key>: <value>"`` with both parts stripped.
    * A line with neither separator is emitted as ``"<line>: "`` (empty value).

    Backslash escapes and line continuations are not interpreted; every
    physical line is processed on its own.

    Args:
        file_content: Raw bytes of the properties file.

    Returns:
        The normalized lines joined with ``"\\n"``.

    Raises:
        TextExtractionError: If decoding or parsing fails for any reason; the
            original exception is chained as the cause.
    """
    try:
        text = _extract_text_from_plain_text(file_content)
        result = []
        for raw_line in text.splitlines():
            line = raw_line.strip()
            # Preserve comments and empty lines
            if not line or line.startswith(("#", "!")):
                result.append(line)
                continue

            # The key ends at the FIRST separator character. Checking "=" before
            # ":" unconditionally would mis-split entries such as
            # "url: http://host/?a=b", where "=" only appears inside the value.
            separator_positions = [pos for pos in (line.find("="), line.find(":")) if pos != -1]
            if separator_positions:
                split_at = min(separator_positions)
                key, value = line[:split_at], line[split_at + 1 :]
            else:
                key, value = line, ""

            result.append(f"{key.strip()}: {value.strip()}")

        return "\n".join(result)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from properties file: {e}") from e
|
||||||
|
Loading…
x
Reference in New Issue
Block a user