From 9c9d3d7bd0c4039d38d31f1326582ce54a17507c Mon Sep 17 00:00:00 2001 From: -LAN- Date: Tue, 27 May 2025 13:27:46 +0800 Subject: [PATCH] feat: document extractor chardet encoding (#20269) Signed-off-by: -LAN- --- .../workflow/nodes/document_extractor/node.py | 72 ++++++++++++++++--- .../nodes/test_document_extractor_node.py | 2 +- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index 84abac7b15..65b5623a2e 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -7,6 +7,7 @@ import tempfile from collections.abc import Mapping, Sequence from typing import Any, cast +import chardet import docx import pandas as pd import pypandoc # type: ignore @@ -180,26 +181,64 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) def _extract_text_from_plain_text(file_content: bytes) -> str: try: - return file_content.decode("utf-8", "ignore") - except UnicodeDecodeError as e: - raise TextExtractionError("Failed to decode plain text file") from e + # Detect encoding using chardet + result = chardet.detect(file_content) + encoding = result["encoding"] + + # Fallback to utf-8 if detection fails + if not encoding: + encoding = "utf-8" + + return file_content.decode(encoding, errors="ignore") + except (UnicodeDecodeError, LookupError) as e: + # If decoding fails, try with utf-8 as last resort + try: + return file_content.decode("utf-8", errors="ignore") + except UnicodeDecodeError: + raise TextExtractionError(f"Failed to decode plain text file: {e}") from e def _extract_text_from_json(file_content: bytes) -> str: try: - json_data = json.loads(file_content.decode("utf-8", "ignore")) + # Detect encoding using chardet + result = chardet.detect(file_content) + encoding = result["encoding"] + + # Fallback to utf-8 if detection fails + if not encoding: + encoding = "utf-8" + + json_data = json.loads(file_content.decode(encoding, errors="ignore")) return json.dumps(json_data, indent=2, ensure_ascii=False) - except (UnicodeDecodeError, json.JSONDecodeError) as e: - raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e + except (UnicodeDecodeError, LookupError, json.JSONDecodeError) as e: + # If decoding fails, try with utf-8 as last resort + try: + json_data = json.loads(file_content.decode("utf-8", errors="ignore")) + return json.dumps(json_data, indent=2, ensure_ascii=False) + except (UnicodeDecodeError, json.JSONDecodeError): + raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e def _extract_text_from_yaml(file_content: bytes) -> str: """Extract the content from yaml file""" try: - yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore")) + # Detect encoding using chardet + result = chardet.detect(file_content) + encoding = result["encoding"] + + # Fallback to utf-8 if detection fails + if not encoding: + encoding = "utf-8" + + yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore")) return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)) - except (UnicodeDecodeError, yaml.YAMLError) as e: - raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e + except (UnicodeDecodeError, LookupError, yaml.YAMLError) as e: + # If decoding fails, try with utf-8 as last resort + try: + yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore")) + return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)) + except (UnicodeDecodeError, yaml.YAMLError): + raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e def _extract_text_from_pdf(file_content: bytes) -> str: @@ -338,7 +377,20 @@ def _extract_text_from_file(file: File): def _extract_text_from_csv(file_content: bytes) -> str: try: - csv_file = io.StringIO(file_content.decode("utf-8", "ignore")) + # Detect encoding using chardet + result = chardet.detect(file_content) + encoding = result["encoding"] + + # Fallback to utf-8 if detection fails + if not encoding: + encoding = "utf-8" + + try: + csv_file = io.StringIO(file_content.decode(encoding, errors="ignore")) + except (UnicodeDecodeError, LookupError): + # If decoding fails, try with utf-8 as last resort + csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore")) + csv_reader = csv.reader(csv_file) rows = list(csv_reader) diff --git a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py index 1e8aec7f88..6d46ea9b89 100644 --- a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py +++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py @@ -150,7 +150,7 @@ def test_extract_text_from_plain_text_non_utf8(): temp_file.write(non_utf8_content) temp_file.seek(0) text = _extract_text_from_plain_text(temp_file.read()) - assert text == "Hello, world." + assert text == "Hello, world©." @patch("pypdfium2.PdfDocument")