diff --git a/api/constants/__init__.py b/api/constants/__init__.py index 5b73046231..a84de0a451 100644 --- a/api/constants/__init__.py +++ b/api/constants/__init__.py @@ -16,11 +16,25 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS]) if dify_config.ETL_TYPE == "Unstructured": - DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt"] + DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt", "properties"] DOCUMENT_EXTENSIONS.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub")) if dify_config.UNSTRUCTURED_API_URL: DOCUMENT_EXTENSIONS.append("ppt") DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) else: - DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv", "vtt"] + DOCUMENT_EXTENSIONS = [ + "txt", + "markdown", + "md", + "mdx", + "pdf", + "html", + "htm", + "xlsx", + "xls", + "docx", + "csv", + "vtt", + "properties", + ] DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index 4d4d46c694..f14dd5d753 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -135,6 +135,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str: return _extract_text_from_yaml(file_content) case "text/vtt": return _extract_text_from_vtt(file_content) + case "text/properties": + return _extract_text_from_properties(file_content) case _: raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}") @@ -170,6 +172,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) return _extract_text_from_msg(file_content) case ".vtt": return _extract_text_from_vtt(file_content) + case ".properties": + return _extract_text_from_properties(file_content) case _: raise UnsupportedFileTypeError(f"Unsupported Extension Type: {file_extension}") @@ -506,3 +510,29 @@ def _extract_text_from_vtt(vtt_bytes: bytes) -> str: # Return the result in the specified format: Speaker "text" style formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results] return "\n".join(formatted) + + +def _extract_text_from_properties(file_content: bytes) -> str: + try: + text = _extract_text_from_plain_text(file_content) + lines = text.splitlines() + result = [] + for line in lines: + line = line.strip() + # Preserve comments and empty lines + if not line or line.startswith("#") or line.startswith("!"): + result.append(line) + continue + + if "=" in line: + key, value = line.split("=", 1) + elif ":" in line: + key, value = line.split(":", 1) + else: + key, value = line, "" + + result.append(f"{key.strip()}: {value.strip()}") + + return "\n".join(result) + except Exception as e: + raise TextExtractionError(f"Failed to extract text from properties file: {str(e)}") from e