fix wrong charset when decoding Chinese content (#6774)

Co-authored-by: zhangwb <zhangwb@zts.com.cn>
This commit is contained in:
eric-0x72 2024-07-30 21:32:45 +08:00 committed by GitHub
parent 53a89bbbc7
commit 98d9837fbc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote
+import chardet
 import cloudscraper
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from regex import regex
@@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str:
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
-    a = extract_using_readabilipy(response.text)
+    # Detect encoding using chardet
+    detected_encoding = chardet.detect(response.content)
+    encoding = detected_encoding['encoding']
+    if encoding:
+        try:
+            content = response.content.decode(encoding)
+        except (UnicodeDecodeError, TypeError):
+            content = response.text
+    else:
+        content = response.text
+    a = extract_using_readabilipy(content)
     if not a['plain_text'] or not a['plain_text'].strip():
         return ''