mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 04:49:04 +08:00
feat: document extractor chardet encoding (#20269)
Signed-off-by: -LAN- <laipz8200@outlook.com>
This commit is contained in:
parent
756f35f480
commit
9c9d3d7bd0
@@ -7,6 +7,7 @@ import tempfile
|
|||||||
from collections.abc import Mapping, Sequence
|
from collections.abc import Mapping, Sequence
|
||||||
from typing import Any, cast
|
from typing import Any, cast
|
||||||
|
|
||||||
|
import chardet
|
||||||
import docx
|
import docx
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pypandoc # type: ignore
|
import pypandoc # type: ignore
|
||||||
@@ -180,26 +181,64 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
|
|||||||
|
|
||||||
def _extract_text_from_plain_text(file_content: bytes) -> str:
|
def _extract_text_from_plain_text(file_content: bytes) -> str:
|
||||||
try:
|
try:
|
||||||
return file_content.decode("utf-8", "ignore")
|
# Detect encoding using chardet
|
||||||
except UnicodeDecodeError as e:
|
result = chardet.detect(file_content)
|
||||||
raise TextExtractionError("Failed to decode plain text file") from e
|
encoding = result["encoding"]
|
||||||
|
|
||||||
|
# Fallback to utf-8 if detection fails
|
||||||
|
if not encoding:
|
||||||
|
encoding = "utf-8"
|
||||||
|
|
||||||
|
return file_content.decode(encoding, errors="ignore")
|
||||||
|
except (UnicodeDecodeError, LookupError) as e:
|
||||||
|
# If decoding fails, try with utf-8 as last resort
|
||||||
|
try:
|
||||||
|
return file_content.decode("utf-8", errors="ignore")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
raise TextExtractionError(f"Failed to decode plain text file: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
def _extract_text_from_json(file_content: bytes) -> str:
|
def _extract_text_from_json(file_content: bytes) -> str:
|
||||||
try:
|
try:
|
||||||
json_data = json.loads(file_content.decode("utf-8", "ignore"))
|
# Detect encoding using chardet
|
||||||
|
result = chardet.detect(file_content)
|
||||||
|
encoding = result["encoding"]
|
||||||
|
|
||||||
|
# Fallback to utf-8 if detection fails
|
||||||
|
if not encoding:
|
||||||
|
encoding = "utf-8"
|
||||||
|
|
||||||
|
json_data = json.loads(file_content.decode(encoding, errors="ignore"))
|
||||||
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
||||||
except (UnicodeDecodeError, json.JSONDecodeError) as e:
|
except (UnicodeDecodeError, LookupError, json.JSONDecodeError) as e:
|
||||||
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
# If decoding fails, try with utf-8 as last resort
|
||||||
|
try:
|
||||||
|
json_data = json.loads(file_content.decode("utf-8", errors="ignore"))
|
||||||
|
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
||||||
|
except (UnicodeDecodeError, json.JSONDecodeError):
|
||||||
|
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
def _extract_text_from_yaml(file_content: bytes) -> str:
|
def _extract_text_from_yaml(file_content: bytes) -> str:
|
||||||
"""Extract the content from yaml file"""
|
"""Extract the content from yaml file"""
|
||||||
try:
|
try:
|
||||||
yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore"))
|
# Detect encoding using chardet
|
||||||
|
result = chardet.detect(file_content)
|
||||||
|
encoding = result["encoding"]
|
||||||
|
|
||||||
|
# Fallback to utf-8 if detection fails
|
||||||
|
if not encoding:
|
||||||
|
encoding = "utf-8"
|
||||||
|
|
||||||
|
yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore"))
|
||||||
return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
|
return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
|
||||||
except (UnicodeDecodeError, yaml.YAMLError) as e:
|
except (UnicodeDecodeError, LookupError, yaml.YAMLError) as e:
|
||||||
raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
|
# If decoding fails, try with utf-8 as last resort
|
||||||
|
try:
|
||||||
|
yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore"))
|
||||||
|
return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
|
||||||
|
except (UnicodeDecodeError, yaml.YAMLError):
|
||||||
|
raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
def _extract_text_from_pdf(file_content: bytes) -> str:
|
def _extract_text_from_pdf(file_content: bytes) -> str:
|
||||||
@@ -338,7 +377,20 @@ def _extract_text_from_file(file: File):
|
|||||||
|
|
||||||
def _extract_text_from_csv(file_content: bytes) -> str:
|
def _extract_text_from_csv(file_content: bytes) -> str:
|
||||||
try:
|
try:
|
||||||
csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
|
# Detect encoding using chardet
|
||||||
|
result = chardet.detect(file_content)
|
||||||
|
encoding = result["encoding"]
|
||||||
|
|
||||||
|
# Fallback to utf-8 if detection fails
|
||||||
|
if not encoding:
|
||||||
|
encoding = "utf-8"
|
||||||
|
|
||||||
|
try:
|
||||||
|
csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
|
||||||
|
except (UnicodeDecodeError, LookupError):
|
||||||
|
# If decoding fails, try with utf-8 as last resort
|
||||||
|
csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore"))
|
||||||
|
|
||||||
csv_reader = csv.reader(csv_file)
|
csv_reader = csv.reader(csv_file)
|
||||||
rows = list(csv_reader)
|
rows = list(csv_reader)
|
||||||
|
|
||||||
|
@@ -150,7 +150,7 @@ def test_extract_text_from_plain_text_non_utf8():
|
|||||||
temp_file.write(non_utf8_content)
|
temp_file.write(non_utf8_content)
|
||||||
temp_file.seek(0)
|
temp_file.seek(0)
|
||||||
text = _extract_text_from_plain_text(temp_file.read())
|
text = _extract_text_from_plain_text(temp_file.read())
|
||||||
assert text == "Hello, world."
|
assert text == "Hello, world©."
|
||||||
|
|
||||||
|
|
||||||
@patch("pypdfium2.PdfDocument")
|
@patch("pypdfium2.PdfDocument")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user