fix wrong charset when decoding Chinese content (#6774)

Co-authored-by: zhangwb <zhangwb@zts.com.cn>
This commit is contained in:
eric-0x72 2024-07-30 21:32:45 +08:00 committed by GitHub
parent 53a89bbbc7
commit 98d9837fbc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -10,6 +10,7 @@ import unicodedata
from contextlib import contextmanager from contextlib import contextmanager
from urllib.parse import unquote from urllib.parse import unquote
+ import chardet
import cloudscraper import cloudscraper
from bs4 import BeautifulSoup, CData, Comment, NavigableString from bs4 import BeautifulSoup, CData, Comment, NavigableString
from regex import regex from regex import regex
@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str:
if response.status_code != 200: if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code) return "URL returned status code {}.".format(response.status_code)
- a = extract_using_readabilipy(response.text)
+ # Detect encoding using chardet
+ detected_encoding = chardet.detect(response.content)
+ encoding = detected_encoding['encoding']
+ if encoding:
+ try:
+ content = response.content.decode(encoding)
+ except (UnicodeDecodeError, TypeError):
+ content = response.text
+ else:
+ content = response.text
+ a = extract_using_readabilipy(content)
if not a['plain_text'] or not a['plain_text'].strip(): if not a['plain_text'] or not a['plain_text'].strip():
return '' return ''