From 98d9837fbc78d21d96e6a2e1d608c818040ebc51 Mon Sep 17 00:00:00 2001 From: eric-0x72 <1466870240@qq.com> Date: Tue, 30 Jul 2024 21:32:45 +0800 Subject: [PATCH] fix wrong charset when decoding Chinese content (#6774) Co-authored-by: zhangwb --- api/core/tools/utils/web_reader_tool.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py index f6f04271d6..a461328ae6 100644 --- a/api/core/tools/utils/web_reader_tool.py +++ b/api/core/tools/utils/web_reader_tool.py @@ -10,6 +10,7 @@ import unicodedata from contextlib import contextmanager from urllib.parse import unquote +import chardet import cloudscraper from bs4 import BeautifulSoup, CData, Comment, NavigableString from regex import regex @@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str: if response.status_code != 200: return "URL returned status code {}.".format(response.status_code) - a = extract_using_readabilipy(response.text) + # Detect encoding using chardet + detected_encoding = chardet.detect(response.content) + encoding = detected_encoding['encoding'] + if encoding: + try: + content = response.content.decode(encoding) + except (UnicodeDecodeError, TypeError): + content = response.text + else: + content = response.text + + a = extract_using_readabilipy(content) if not a['plain_text'] or not a['plain_text'].strip(): return ''