feat: document extractor chardet encoding (#20269)

Signed-off-by: -LAN- <laipz8200@outlook.com>
This commit is contained in:
-LAN- 2025-05-27 13:27:46 +08:00 committed by GitHub
parent 756f35f480
commit 9c9d3d7bd0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 63 additions and 11 deletions

View File

@@ -7,6 +7,7 @@ import tempfile
from collections.abc import Mapping, Sequence from collections.abc import Mapping, Sequence
from typing import Any, cast from typing import Any, cast
import chardet
import docx import docx
import pandas as pd import pandas as pd
import pypandoc # type: ignore import pypandoc # type: ignore
@@ -180,26 +181,64 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
def _extract_text_from_plain_text(file_content: bytes) -> str:
    """Decode a plain-text file's raw bytes into a string.

    The encoding is auto-detected with chardet; utf-8 is used when detection
    returns nothing or when the detected codec name is unknown to Python.

    Args:
        file_content: Raw bytes of the file.

    Returns:
        The decoded text. Undecodable byte sequences are dropped
        (``errors="ignore"``), so this never fails on bad bytes.
    """
    # Detect encoding using chardet; fall back to utf-8 when detection
    # fails (e.g. empty or ambiguous input yields encoding=None).
    encoding = chardet.detect(file_content)["encoding"] or "utf-8"
    try:
        return file_content.decode(encoding, errors="ignore")
    except LookupError:
        # chardet reported a codec Python does not recognize; retry with
        # utf-8 as a last resort. NOTE: decode(..., errors="ignore") never
        # raises UnicodeDecodeError, so this fallback cannot itself fail —
        # the original inner `except UnicodeDecodeError` branch was dead code.
        return file_content.decode("utf-8", errors="ignore")
def _extract_text_from_json(file_content: bytes) -> str:
    """Parse JSON bytes (encoding auto-detected via chardet) and pretty-print.

    Args:
        file_content: Raw bytes of the JSON file.

    Returns:
        The parsed document re-serialized with ``indent=2`` and
        ``ensure_ascii=False``.

    Raises:
        TextExtractionError: when the content cannot be parsed as JSON even
            after falling back to utf-8 decoding.
    """
    # Detect encoding using chardet; fall back to utf-8 if detection fails.
    encoding = chardet.detect(file_content)["encoding"] or "utf-8"
    try:
        json_data = json.loads(file_content.decode(encoding, errors="ignore"))
    except (LookupError, json.JSONDecodeError) as e:
        # LookupError: chardet returned a codec name Python doesn't know.
        # JSONDecodeError: a mis-detected encoding can mangle the text, so
        # retry once with utf-8 before giving up.
        # (decode(..., errors="ignore") never raises UnicodeDecodeError, so
        # catching it — as the original did — was dead code.)
        try:
            json_data = json.loads(file_content.decode("utf-8", errors="ignore"))
        except json.JSONDecodeError:
            raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
    return json.dumps(json_data, indent=2, ensure_ascii=False)
def _extract_text_from_yaml(file_content: bytes) -> str:
    """Extract the content from a YAML file, auto-detecting its encoding.

    Args:
        file_content: Raw bytes of the YAML file.

    Returns:
        All documents re-serialized via ``yaml.dump_all`` with
        ``allow_unicode=True`` and ``sort_keys=False``.

    Raises:
        TextExtractionError: when the content cannot be parsed as YAML even
            after falling back to utf-8 decoding.
    """
    # Detect encoding using chardet; fall back to utf-8 if detection fails.
    encoding = chardet.detect(file_content)["encoding"] or "utf-8"
    try:
        # safe_load_all is lazy: parse errors surface while dump_all iterates,
        # so dump_all must remain inside the try block.
        yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore"))
        return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
    except (LookupError, yaml.YAMLError) as e:
        # LookupError: unknown codec name from chardet; YAMLError: the
        # detected encoding may have mangled the text — retry with utf-8.
        # (decode(..., errors="ignore") never raises UnicodeDecodeError, so
        # catching it — as the original did — was dead code.)
        try:
            yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore"))
            return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
        except yaml.YAMLError:
            raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
def _extract_text_from_pdf(file_content: bytes) -> str: def _extract_text_from_pdf(file_content: bytes) -> str:
@@ -338,7 +377,20 @@ def _extract_text_from_file(file: File):
def _extract_text_from_csv(file_content: bytes) -> str: def _extract_text_from_csv(file_content: bytes) -> str:
try: try:
csv_file = io.StringIO(file_content.decode("utf-8", "ignore")) # Detect encoding using chardet
result = chardet.detect(file_content)
encoding = result["encoding"]
# Fallback to utf-8 if detection fails
if not encoding:
encoding = "utf-8"
try:
csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
except (UnicodeDecodeError, LookupError):
# If decoding fails, try with utf-8 as last resort
csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore"))
csv_reader = csv.reader(csv_file) csv_reader = csv.reader(csv_file)
rows = list(csv_reader) rows = list(csv_reader)

View File

@@ -150,7 +150,7 @@ def test_extract_text_from_plain_text_non_utf8():
temp_file.write(non_utf8_content) temp_file.write(non_utf8_content)
temp_file.seek(0) temp_file.seek(0)
text = _extract_text_from_plain_text(temp_file.read()) text = _extract_text_from_plain_text(temp_file.read())
assert text == "Hello, world." assert text == "Hello, world©."
@patch("pypdfium2.PdfDocument") @patch("pypdfium2.PdfDocument")